Load the required packages
# The initialization settings - Load the required packages
library(tidyverse)   # data wrangling (dplyr, tidyr) and plotting (ggplot2)
library(mice)        # multiple imputation by chained equations; also mdc() colors
library(coda)        # MCMC diagnostics (loaded for later Bayesian work; not used in this chunk)
library(car)         # regression helpers used below: boxTidwell(), vif()
library(effects)     # effect() for visualizing fitted model terms
Install and load the JAGS package and INLA package
# Install and load the JAGS package
# This code compiles and installs JAGS from the source website and it takes about 6-7 minutes.
# Download the JAGS 4.3.0 source tarball into the Kaggle working directory.
system(paste0("wget https://sourceforge.net/projects/mcmc-jags/files/JAGS/4.x/Source",
              "/JAGS-4.3.0.tar.gz -P /kaggle/working"))
system("tar xvfz /kaggle/working/JAGS-4.3.0.tar.gz")
# BUG FIX: every system() call runs in its own shell, so a separate
# system("cd ...") has no effect on later calls -- configure/make were
# running in R's working directory instead of the source tree. Chain the
# whole build in a single shell invocation so each step runs inside
# /kaggle/working/JAGS-4.3.0 and stops early if a step fails.
system(paste("cd /kaggle/working/JAGS-4.3.0",
             "./configure", "make", "make install", sep = " && "))
# Install the rjags R interface into the writable library path and load it.
install.packages("rjags", lib = "/kaggle/working")
library(rjags, lib.loc = "/kaggle/working")
# The following code does the full installation of the INLA package.
# It will also take several minutes to complete the installation.
# `dependencies` is spelled out in full: the original `dep = TRUE` relied on
# partial argument matching, which is fragile and flagged by lintr.
install.packages("INLA", repos = c(getOption("repos"),
                                   INLA = "https://inla.r-inla-download.org/R/stable"),
                 dependencies = TRUE, lib = "/kaggle/working")
library(INLA, lib.loc = "/kaggle/working")
Load the GB.rda data set
You first need to upload this data set to the following path on Kaggle:
"../input/great-british/GB.rda"
Perform some initial analyses on the GB data set and arrange this data set to make it suitable for our work
# Load the `GB` data set
load("../input/great-british/GB.rda")
# Display the summary statistics of the original data set, `GB`
str(GB)
summary(GB)
# Visualize the distribution of the missing data for the three variables
# about electronic devices, `IC009Q01TA`, `IC009Q02TA` and `IC009Q03TA`
# md_pattern() and its melt helper are sourced straight from the JointAI
# GitHub repository; mdc() (missing-data colors) comes from the mice package.
source("https://raw.githubusercontent.com/NErler/JointAI/master/R/md_pattern.R")
source("https://raw.githubusercontent.com/NErler/JointAI/master/R/helpfunctions_melt.R")
# md_pattern() returns a ggplot object (hence the `+` layers below);
# ggsave() then writes the last displayed plot to file.
md_pattern(GB %>% select(IC009Q01TA, IC009Q02TA, IC009Q03TA) %>%
rename(`Desktop computer` = IC009Q01TA,
`Laptop or notebook` = IC009Q02TA,
`Tablet computer` = IC009Q03TA),
color = c(mdc(1), mdc(2))) +
labs(title = "Missing value patterns of electronic device related variables") +
theme(axis.text.x.top = element_text(angle = 0, hjust = 0.5),
axis.text = element_text(size = 10),
plot.title = element_text(hjust = 0.5, size = 12))
ggsave("P1F1.png", width = 6, height = 6)
# Arrange the data set `GB` to make it suitable for our work
Great.British = GB %>%
mutate(
# Mother's/father's education as ordered factors over ISCED levels 0-6
MISCED = factor(MISCED, levels = c("0", "1", "2", "3", "4", "5", "6"), ordered = TRUE),
FISCED = factor(FISCED, levels = c("0", "1", "2", "3", "4", "5", "6"), ordered = TRUE),
ST004D01T = factor(ST004D01T, levels = c(1, 2), labels = c("Female", "Male")),
IMMIG = factor(IMMIG, levels = c("1", "2", "3"),
labels = c("Native", "2ndGener", "1stGener")),
# Learning time: shift so the minimum is 0, then divide by the standard
# deviation (note: not a z-score -- the mean is not subtracted)
TMINS = (TMINS - min(TMINS, na.rm = TRUE))/sd(TMINS, na.rm = TRUE),
Region = factor(Region, levels = c("82611", "82612", "82613", "82620"),
labels = c("England", "NorthIre", "Wales", "Scotland")),
# School type: use SC013Q01TA where available; otherwise fall back to
# SCHLTYPE, mapping code 3 to "Public" and other codes to "Private"
# (NOTE(review): confirm this coding against the PISA codebook)
School.Type = factor(ifelse(is.na(SC013Q01TA), ifelse(SCHLTYPE == 3, 1, 2), SC013Q01TA),
levels = c("1", "2"), labels = c("Public", "Private")),
SC001Q01TA = factor(SC001Q01TA, levels = c("1", "2", "3", "4", "5"), ordered = TRUE,
labels = c("Village", "Small.Town", "Town", "City", "Large.City")),
# Total enrolment (sum of the two SC002 counts), shifted and scaled the
# same way as TMINS above
Total.Student = ((SC002Q01TA + SC002Q02TA) - min(SC002Q01TA + SC002Q02TA,
na.rm = TRUE))/sd(SC002Q01TA + SC002Q02TA, na.rm = TRUE),
# SC002Q02TA appears to be the girl count, given this ratio's name
Girl.Ratio = SC002Q02TA/(SC002Q01TA + SC002Q02TA),
year = factor(year, levels = c("2015", "2018")),
# Convert percentage to a proportion in [0, 1]
SC048Q03NA = SC048Q03NA/100
) %>%
rename(Gender = ST004D01T, Area.Type = SC001Q01TA,
Deprived = SC048Q03NA, Year = year) %>%
# Drop the raw columns that were merged/recoded above plus the school id
select(!c(CNTSCHID, IC009Q01TA, IC009Q02TA, IC009Q03TA,
SC002Q01TA, SC002Q02TA, SC013Q01TA, SCHLTYPE))
# Display the summary statistics of the data set after collation, `Great.British`
str(Great.British)
summary(Great.British)
(1) Visualize the patterns of missing values in all the variables
(2) Perform imputation for the missing values in the data set, and compare the density or proportional distribution of the observed values with that of imputed values
(3) Complete the missing values to construct a complete data set finally
# Visualize the patterns of missing values in all the variables with missingness
# (md_pattern() was sourced from the JointAI repository earlier in the script)
Great.British %>% select(
MISCED, FISCED, ESCS, IMMIG, TMINS, Deprived, Area.Type,
School.Type, Total.Student, Girl.Ratio) %>%
md_pattern(color = c(mdc(1), mdc(2)), print_yaxis = FALSE) +
labs(title = "Missing value patterns of the variables subject to missingness") +
ylab("Distribution of missingness patterns") +
theme(axis.text.x.top = element_text(angle = 0, hjust = 0.5, size = 10),
axis.text.x = element_text(size = 10),
plot.title = element_text(hjust = 0.5))
ggsave("P1F2.png", width = 9, height = 5)
# Perform imputation for the missing values in the data set
# Dry run (maxit = 0): fit nothing, just obtain mice's default predictor
# matrix and per-variable imputation methods.
impu_dummy = mice(Great.British, maxit = 0)
impu_mat = impu_dummy$predictorMatrix
impu_method = impu_dummy$method
# Rows of the predictor matrix say which predictors each variable's
# imputation model uses; zeroing a row matters only for variables that are
# actually imputed. NOTE(review): zeroing rows 1:15 by position assumes a
# specific column order of `Great.British` (presumably the fully observed
# score columns) -- confirm, since a reordering would silently strip
# predictors from imputed variables.
impu_mat[1:15,] = 0
impu_mat[c("Gender", "Region", "Year"),] = 0
# Run the actual imputation (mice defaults: m = 5 imputed data sets);
# fixed seed for reproducibility.
impu = mice(Great.British, printFlag = FALSE,
predictorMatrix = impu_mat, seed = 666)
# Compare the density (for continuous variables) or proportional distribution
# (for categorical variables) of the observed values with that of imputed values
# densityplot() is the mice lattice method: observed vs imputed densities.
densityplot(impu)
# propplot() (sourced from N. Erler's gist) draws the analogous comparison
# for categorical variables as proportions.
source(paste("https://gist.githubusercontent.com/NErler/",
"0d00375da460dd33839b98faeee2fdab/raw/",
"c6f537ecf80eddcefd94992ec7926aa57d454536/propplot.R", sep = ""))
propplot(impu)
# Complete the missing values in the variables `MISCED` and `FISCED`
# Display the summary statistics of the complete data set after imputation
# complete(impu, 1) extracts the FIRST of the imputed data sets; only this
# single completed set is carried forward into the modelling below.
GB_comp = complete(impu, 1)
str(GB_comp)
summary(GB_comp)
(1) I perform some adjustments on the complete data set after imputation to construct the final data set that we will use to fit statistical models.
(2) I randomly select an integer between 1 and 5, and the identifier of the scores for which I will fit statistical models is based on this integer.
# Perform some adjustments on the complete data set after imputation
# to construct the final data set that we will use to fit statistical models
GB.df = GB_comp %>%
  mutate(
    # ISCED: highest parental education level, i.e. the element-wise
    # maximum of the ordered factors MISCED and FISCED. Comparing integer
    # codes is valid because both factors share the same ordered levels
    # ("0" < "1" < ... < "6"). This replaces the original
    # sapply(1:nrow(...), ...) row loop, which was both slow and fragile
    # (it relied on sapply() collapsing a list of length-1 factors), with
    # a vectorized expression producing the identical result.
    ISCED = factor(ifelse(as.integer(MISCED) >= as.integer(FISCED),
                          as.character(MISCED), as.character(FISCED)),
                   levels = c("0", "1", "2", "3", "4", "5", "6")),
    # Drop the ordering of Area.Type so lm() uses treatment contrasts
    # rather than polynomial contrasts for this factor.
    Area.Type = factor(Area.Type, ordered = FALSE)
  ) %>%
  select(!c(MISCED, FISCED))
# Display the summary statistics of the final data set
# that we will use to fit statistical models
data_size = nrow(GB.df)
str(GB.df)
summary(GB.df)
# Fix the random seed
set.seed(1600)
# Draw a single observation from a multinomial distribution with equal
# probability over five cells; exactly one cell equals 1, and its index
# `a` is therefore a uniformly random integer between 1 and 5.
# Statistical models will be fitted for PV`a`MATH, PV`a`READ and PV`a`SCIE.
a = which(rmultinom(n = 1, size = 1, prob = rep(0.2, 5)) == 1)
print(a)
# Column names of the plausible-value scores for the chosen `a`
math    = paste0("PV", a, "MATH")
reading = paste0("PV", a, "READ")
science = paste0("PV", a, "SCIE")
Fit generalized regression models (GLM) and Bayesian models to predict the scores of the three subjects: math, reading and science. In this part, perform exploratory data analysis to inform the statistical models.
1. Exploration on the relationship between the response variable and the covariates
Plot the scores of the three subjects against each individual variable
# Plot the math score against each individual variable
# NOTE(review): layout() defines a 2x3 grid (6 panels) while 12 plots are
# drawn below, so the device cycles through the grid in two batches of six
# panels -- confirm this paging is intended.
layout(matrix(c(1:6), 2, 3, byrow = TRUE))
# NOTE(review): attach() is generally discouraged (it can mask objects on
# the search path); with(GB.df, ...) or explicit GB.df$ columns would be
# safer. Left unchanged here.
attach(GB.df)
# Scatter plots: math score against each continuous covariate
plot(ESCS, GB.df[, math], col = "deepskyblue4",
xlab = "ESCS", ylab = "Math Score",
main = "The trend of math score with ESCS",
cex.axis = 1.5, cex.lab = 1.5, cex.main = 1.5)
plot(TMINS, GB.df[, math], col = "deepskyblue4",
xlab = "Learning Time (Scaled)", ylab = "Math Score",
main = "The trend of math score with learning time",
cex.axis = 1.5, cex.lab = 1.5, cex.main = 1.5)
plot(Deprived, GB.df[, math], col = "deepskyblue4",
xlab = "Percentage", ylab = "Math Score",
main = "The trend of math score with deprivation proportion",
cex.axis = 1.5, cex.lab = 1.5, cex.main = 1.5)
plot(Total.Student, GB.df[, math], col = "deepskyblue4",
xlab = "Number of Students (Scaled)", ylab = "Math Score",
main = "The trend of math score with number of students",
cex.axis = 1.5, cex.lab = 1.5, cex.main = 1.5)
plot(Girl.Ratio, GB.df[, math], col = "deepskyblue4",
xlab = "Girl Ratio", ylab = "Math Score",
main = "The trend of math score with gender ratio",
cex.axis = 1.5, cex.lab = 1.5, cex.main = 1.5)
# Box plots: math score split by each categorical covariate
boxplot(formula = GB.df[, math] ~ ISCED, xlab = "Education Level", ylab = "Math Score",
main = "Differences in math score caused\nby parental education level",
cex.axis = 1.5, cex.lab = 1.5, cex.main = 1.5)
boxplot(formula = GB.df[, math] ~ Gender, xlab = "Gender", ylab = "Math Score",
main = "Differences in math score caused by gender",
cex.axis = 1.5, cex.lab = 1.5, cex.main = 1.5)
boxplot(formula = GB.df[, math] ~ IMMIG, xlab = "Immigrant Status", ylab = "Math Score",
main = "Differences in math score caused\nby immigrant status",
cex.axis = 1.5, cex.lab = 1.5, cex.main = 1.5)
boxplot(formula = GB.df[, math] ~ Region, xlab = "Region", ylab = "Math Score",
main = "Math scores in different regions",
cex.axis = 1.5, cex.lab = 1.5, cex.main = 1.5)
boxplot(formula = GB.df[, math] ~ School.Type, xlab = "School Type", ylab = "Math Score",
main = "Differences in math score caused by school type",
cex.axis = 1.5, cex.lab = 1.5, cex.main = 1.5)
boxplot(formula = GB.df[, math] ~ Area.Type, xlab = "Area Type", ylab = "Math Score",
main = "Differences in math score caused by area type",
cex.axis = 1.25, cex.lab = 1.5, cex.main = 1.5)
boxplot(formula = GB.df[, math] ~ Year, xlab = "Year", ylab = "Math Score",
main = "Math scores in different years",
cex.axis = 1.5, cex.lab = 1.5, cex.main = 1.5)
detach(GB.df)
# Plot the reading score against each individual variable
# (same panel sequence as the math plots above; the layout() grid set
# there is still active on the device)
attach(GB.df)
# Scatter plots: reading score against each continuous covariate
plot(ESCS, GB.df[, reading], col = "deepskyblue4",
xlab = "ESCS", ylab = "Reading Score",
main = "The trend of reading score with ESCS",
cex.axis = 1.5, cex.lab = 1.5, cex.main = 1.5)
plot(TMINS, GB.df[, reading], col = "deepskyblue4",
xlab = "Learning Time (Scaled)", ylab = "Reading Score",
main = "The trend of reading score with learning time",
cex.axis = 1.5, cex.lab = 1.5, cex.main = 1.5)
plot(Deprived, GB.df[, reading], col = "deepskyblue4",
xlab = "Percentage", ylab = "Reading Score",
main = "The trend of reading score with deprivation proportion",
cex.axis = 1.5, cex.lab = 1.5, cex.main = 1.5)
plot(Total.Student, GB.df[, reading], col = "deepskyblue4",
xlab = "Number of Students (Scaled)", ylab = "Reading Score",
main = "The trend of reading score with number of students",
cex.axis = 1.5, cex.lab = 1.5, cex.main = 1.5)
plot(Girl.Ratio, GB.df[, reading], col = "deepskyblue4",
xlab = "Girl Ratio", ylab = "Reading Score",
main = "The trend of reading score with gender ratio",
cex.axis = 1.5, cex.lab = 1.5, cex.main = 1.5)
# Box plots: reading score split by each categorical covariate
boxplot(formula = GB.df[, reading] ~ ISCED, xlab = "Education Level", ylab = "Reading Score",
main = "Differences in reading score caused\nby parental education level",
cex.axis = 1.5, cex.lab = 1.5, cex.main = 1.5)
# BUG FIX: the original plotted GB.df[, math] here although this is the
# reading section and the axis label/title say "Reading Score"; use the
# reading column so the figure matches its caption.
boxplot(formula = GB.df[, reading] ~ Gender, xlab = "Gender", ylab = "Reading Score",
main = "Differences in reading score caused by gender",
cex.axis = 1.5, cex.lab = 1.5, cex.main = 1.5)
boxplot(formula = GB.df[, reading] ~ IMMIG, xlab = "Immigrant Status", ylab = "Reading Score",
main = "Differences in reading score caused\nby immigrant status",
cex.axis = 1.5, cex.lab = 1.5, cex.main = 1.5)
boxplot(formula = GB.df[, reading] ~ Region, xlab = "Region", ylab = "Reading Score",
main = "Reading scores in different regions",
cex.axis = 1.5, cex.lab = 1.5, cex.main = 1.5)
boxplot(formula = GB.df[, reading] ~ School.Type, xlab = "School Type", ylab = "Reading Score",
main = "Differences in reading score caused by school type",
cex.axis = 1.5, cex.lab = 1.5, cex.main = 1.5)
boxplot(formula = GB.df[, reading] ~ Area.Type, xlab = "Area Type", ylab = "Reading Score",
main = "Differences in reading score caused by area type",
cex.axis = 1.25, cex.lab = 1.5, cex.main = 1.5)
boxplot(formula = GB.df[, reading] ~ Year, xlab = "Year", ylab = "Reading Score",
main = "Reading scores in different years",
cex.axis = 1.5, cex.lab = 1.5, cex.main = 1.5)
detach(GB.df)
# Plot the science score against each individual variable
# (same panel sequence as the math and reading plots above)
attach(GB.df)
# Scatter plots: science score against each continuous covariate
plot(ESCS, GB.df[, science], col = "deepskyblue4",
xlab = "ESCS", ylab = "Science Score",
main = "The trend of science score with ESCS",
cex.axis = 1.5, cex.lab = 1.5, cex.main = 1.5)
plot(TMINS, GB.df[, science], col = "deepskyblue4",
xlab = "Learning Time (Scaled)", ylab = "Science Score",
main = "The trend of science score with learning time",
cex.axis = 1.5, cex.lab = 1.5, cex.main = 1.5)
plot(Deprived, GB.df[, science], col = "deepskyblue4",
xlab = "Percentage", ylab = "Science Score",
main = "The trend of science score with deprivation proportion",
cex.axis = 1.5, cex.lab = 1.5, cex.main = 1.5)
plot(Total.Student, GB.df[, science], col = "deepskyblue4",
xlab = "Number of Students (Scaled)", ylab = "Science Score",
main = "The trend of science score with number of students",
cex.axis = 1.5, cex.lab = 1.5, cex.main = 1.5)
plot(Girl.Ratio, GB.df[, science], col = "deepskyblue4",
xlab = "Girl Ratio", ylab = "Science Score",
main = "The trend of science score with gender ratio",
cex.axis = 1.5, cex.lab = 1.5, cex.main = 1.5)
# Box plots: science score split by each categorical covariate
boxplot(formula = GB.df[, science] ~ ISCED, xlab = "Education Level", ylab = "Science Score",
main = "Differences in science score caused\nby parental education level",
cex.axis = 1.5, cex.lab = 1.5, cex.main = 1.5)
boxplot(formula = GB.df[, science] ~ Gender, xlab = "Gender", ylab = "Science Score",
main = "Differences in science score caused by gender",
cex.axis = 1.5, cex.lab = 1.5, cex.main = 1.5)
boxplot(formula = GB.df[, science] ~ IMMIG, xlab = "Immigrant Status", ylab = "Science Score",
main = "Differences in science score caused\nby immigrant status",
cex.axis = 1.5, cex.lab = 1.5, cex.main = 1.5)
boxplot(formula = GB.df[, science] ~ Region, xlab = "Region", ylab = "Science Score",
main = "Science scores in different regions",
cex.axis = 1.5, cex.lab = 1.5, cex.main = 1.5)
boxplot(formula = GB.df[, science] ~ School.Type, xlab = "School Type", ylab = "Science Score",
main = "Differences in science score caused by school type",
cex.axis = 1.5, cex.lab = 1.5, cex.main = 1.5)
boxplot(formula = GB.df[, science] ~ Area.Type, xlab = "Area Type", ylab = "Science Score",
main = "Differences in science score caused by area type",
cex.axis = 1.25, cex.lab = 1.5, cex.main = 1.5)
boxplot(formula = GB.df[, science] ~ Year, xlab = "Year", ylab = "Science Score",
main = "Science scores in different years",
cex.axis = 1.5, cex.lab = 1.5, cex.main = 1.5)
detach(GB.df)
# Close all open graphics devices before the next figure group
graphics.off()
Use the boxTidwell function in car package to check the linearity between the response and the covariates, and then transform some variables to enhance the linear relationship
# Use the `boxTidwell` function in `car` package to improve the linear
# relationship between the response variable and the covariates
# boxTidwell() estimates a power transformation for each (strictly
# positive) predictor; the 10^(-6) offset keeps zero values of `Deprived`
# and `Total.Student` positive so the power family is defined.
# The operation on the math subject
print("The operation on the math subject")
boxTidwell(get(math) ~ I(Deprived + 10^(-6)), data = GB.df)
boxTidwell(get(math) ~ exp(-Deprived), data = GB.df)
boxTidwell(get(math) ~ I(Total.Student + 10^(-6)), data = GB.df, max.iter = 100)
# The operation on the reading subject
print("The operation on the reading subject")
boxTidwell(get(reading) ~ I(Deprived + 10^(-6)), data = GB.df)
boxTidwell(get(reading) ~ exp(-Deprived), data = GB.df)
boxTidwell(get(reading) ~ I(Total.Student + 10^(-6)), data = GB.df, max.iter = 100)
# The operation on the science subject
print("The operation on the science subject")
boxTidwell(get(science) ~ I(Deprived + 10^(-6)), data = GB.df)
boxTidwell(get(science) ~ exp(-Deprived), data = GB.df)
# NOTE(review): max.iter = 1000 here vs 100 for math/reading -- presumably
# the science fit needed more iterations to converge; confirm this is
# intentional rather than a typo.
boxTidwell(get(science) ~ I(Total.Student + 10^(-6)), data = GB.df, max.iter = 1000)
# Transform some variables to enhance the linear relationship
# Plot the math score against transformed variables
layout(matrix(c(1:6), 3, 2, byrow = TRUE))
# Columns are referenced through GB.df directly instead of via
# attach()/detach(), which avoids masking objects on the search path.
plot(sqrt(GB.df$Deprived), GB.df[, math], col = "deepskyblue4",
     xlab = "Percentage (Square Root)", ylab = "Math Score",
     main = "The trend of math score with\nthe square root of deprivation proportion",
     cex.axis = 1.5, cex.lab = 1.5, cex.main = 1.5)
plot(exp(-3*GB.df$Deprived), GB.df[, math], col = "deepskyblue4",
     xlab = "Percentage (Exponential Transformed)", ylab = "Math Score",
     main = "The trend of math score with exponential\ntransformed deprivation proportion",
     cex.axis = 1.5, cex.lab = 1.5, cex.main = 1.5)
# Transform some variables to enhance the linear relationship
# Plot the reading score against transformed variables
# Columns are referenced through GB.df directly instead of attach()/detach().
plot(sqrt(GB.df$Deprived), GB.df[, reading], col = "deepskyblue4",
     xlab = "Percentage (Square Root)", ylab = "Reading Score",
     main = "The trend of reading score with\nthe square root of deprivation proportion",
     cex.axis = 1.5, cex.lab = 1.5, cex.main = 1.5)
plot(exp(-3*GB.df$Deprived), GB.df[, reading], col = "deepskyblue4",
     xlab = "Percentage (Exponential Transformed)", ylab = "Reading Score",
     main = "The trend of reading score with exponential\ntransformed deprivation proportion",
     cex.axis = 1.5, cex.lab = 1.5, cex.main = 1.5)
# Transform some variables to enhance the linear relationship
# Plot the science score against transformed variables
# Columns are referenced through GB.df directly instead of attach()/detach().
plot(sqrt(GB.df$Deprived), GB.df[, science], col = "deepskyblue4",
     xlab = "Percentage (Square Root)", ylab = "Science Score",
     main = "The trend of science score with\nthe square root of deprivation proportion",
     cex.axis = 1.5, cex.lab = 1.5, cex.main = 1.5)
plot(exp(-3*GB.df$Deprived), GB.df[, science], col = "deepskyblue4",
     xlab = "Percentage (Exponential Transformed)", ylab = "Science Score",
     main = "The trend of science score with exponential\ntransformed deprivation proportion",
     cex.axis = 1.5, cex.lab = 1.5, cex.main = 1.5)
# Close all open graphics devices now that the transformed-variable
# figures are done
graphics.off()
2. Perform model selection to find the best model for each subject
Model selection process for Math subject
# Model selection for Math subject: Step one
# Full model: all candidate covariates; Deprived enters via the
# exponential transform chosen in the Box-Tidwell exploration above.
formula.math1 = "get(math) ~ ISCED + ESCS + Gender + IMMIG +
TMINS + Region + I(exp(-3*Deprived)) + Year + School.Type +
Total.Student + Area.Type + Girl.Ratio"
fit.math1 = lm(formula.math1, data = GB.df)
summary(fit.math1)
anova(fit.math1)
# NOTE(review): the model is fitted with lm() (Gaussian linear model),
# although the printed label says "GLM".
print(paste("The AIC of the GLM model is:", AIC(fit.math1)))
# Variance inflation factors to screen for multicollinearity
vif(fit.math1)
# Model selection for Math subject: Step two
# Girl.Ratio dropped; ISCED moved to the end, presumably so the
# sequential (Type I) anova table tests it after the other terms.
formula.math2 = "get(math) ~ ESCS + Gender + IMMIG +
TMINS + Region + I(exp(-3*Deprived)) + Year + School.Type +
Total.Student + Area.Type + ISCED"
fit.math2 = lm(formula.math2, data = GB.df)
summary(fit.math2)
anova(fit.math2)
print(paste("The AIC of the GLM model is:", AIC(fit.math2)))
# Visualize the relationship between the math scores and some interaction terms
# Labels for the 3 immigrant-status x 4 region combinations, in boxplot()'s
# layout order. A dedicated name avoids shadowing base::names(); the data=
# argument replaces the original attach()/detach().
immig_region_labels = c("Nat.Eng", "2ndG.Eng", "1stG.Eng",
                        "Nat.NorIre", "2ndG.NorIre", "1stG.NorIre",
                        "Nat.Wales", "2ndG.Wales", "1stG.Wales",
                        "Nat.Scot", "2ndG.Scot", "1stG.Scot")
boxplot(formula = GB.df[, math] ~ IMMIG*Region, data = GB.df, xaxt = "n",
        xlab = "", ylab = "Math Score", cex.main = 1.2,
        main = "Differences in math score caused by\nimmigrant status and region")
axis(side = 1, at = 1:12, labels = rep("", 12))
# Slanted group labels drawn manually beneath the axis
text(x = 1:12 - 0.6, y = 40, labels = immig_region_labels, xpd = TRUE, srt = 45)
title(xlab = "Immigrant Status & Region", line = 4, cex.lab = 1.2)
school_region_labels = c("Pub.Eng", "Pri.Eng", "Pub.NorIre", "Pri.NorIre",
                         "Pub.Wales", "Pri.Wales", "Pub.Scot", "Pri.Scot")
boxplot(formula = GB.df[, math] ~ School.Type*Region, data = GB.df, xaxt = "n",
        xlab = "", ylab = "Math Score", cex.main = 1.2,
        main = "Differences in math score caused by\nschool type and region")
axis(side = 1, at = 1:8, labels = rep("", 8))
text(x = 1:8 - 0.4, y = 40, labels = school_region_labels, xpd = TRUE, srt = 45)
title(xlab = "School Type & Region", line = 4, cex.lab = 1.2)
# Scatter of the math score against the ESCS x learning-time product
plot(GB.df$ESCS*GB.df$TMINS, GB.df[, math], col = "blue",
     xlab = "ESCS * Learning Time", ylab = "Math Score", cex.main = 1.2,
     main = "The trend of math score with ESCS multiplying learning time")
# Model selection for Math subject: Step three
# Re-introduce ISCED and add the IMMIG x Region interaction suggested by
# the box plots above.
formula.math3 = "get(math) ~ ISCED + ESCS + Gender + IMMIG +
TMINS + Region + I(exp(-3*Deprived)) + Year + School.Type +
Total.Student + Area.Type + IMMIG:Region"
fit.math3 = lm(formula.math3, data = GB.df)
summary(fit.math3)
anova(fit.math3)
print(paste("The AIC of the GLM model is:", AIC(fit.math3)))
# Model selection for Math subject: Step four
# Add the School.Type x Region interaction.
formula.math4 = "get(math) ~ ISCED + ESCS + Gender + IMMIG +
TMINS + Region + I(exp(-3*Deprived)) + Year + School.Type +
Total.Student + Area.Type + IMMIG:Region + School.Type:Region"
fit.math4 = lm(formula.math4, data = GB.df)
summary(fit.math4)
anova(fit.math4)
print(paste("The AIC of the GLM model is:", AIC(fit.math4)))
# Model selection for Math subject: Step five
# Add the ESCS x TMINS interaction; this is the final math model.
formula.math5 = "get(math) ~ ISCED + ESCS + Gender + IMMIG +
TMINS + Region + I(exp(-3*Deprived)) + Year + School.Type +
Total.Student + Area.Type + IMMIG:Region +
School.Type:Region + ESCS:TMINS"
fit.math5 = lm(formula.math5, data = GB.df)
summary(fit.math5)
anova(fit.math5)
print(paste("The AIC of the GLM model is:", AIC(fit.math5)))
# Check the model assumptions
# Panel 1: studentized residuals by observation index (labelled
# "Standardized" on the plot); panels 2-4: the first three lm() diagnostic
# plots (residuals vs fitted, normal Q-Q, scale-location).
par(mfrow = c(2, 2))
plot(rstudent(fit.math5), ylab = "Standardized Residuals",
cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2,
main = "Standardized residuals vs Index", col = "green4")
plot(fit.math5, col = "green4", sub = "", cex.main = 1.2,
cex.lab = 1.15, cex.axis = 1.2, lwd = 2, which = 1:3)
graphics.off()
Model selection process for Reading subject
# Model selection for Reading subject: Step one
# Full model: all candidate covariates; unlike the math model,
# Total.Student enters through a square-root transform here.
formula.reading1 = "get(reading) ~ ISCED + ESCS + Gender + IMMIG +
TMINS + Region + I(exp(-3*Deprived)) + Year + School.Type +
I(sqrt(Total.Student)) + Area.Type + Girl.Ratio"
fit.reading1 = lm(formula.reading1, data = GB.df)
summary(fit.reading1)
anova(fit.reading1)
# NOTE(review): the fit uses lm() although the printed label says "GLM".
print(paste("The AIC of the GLM model is:", AIC(fit.reading1)))
# Model selection for Reading subject: Step two
# Girl.Ratio dropped; ISCED moved to the end, presumably so the
# sequential (Type I) anova table tests it after the other terms.
formula.reading2 = "get(reading) ~ ESCS + Gender + IMMIG +
TMINS + Region + I(exp(-3*Deprived)) + Year + School.Type +
I(sqrt(Total.Student)) + Area.Type + ISCED"
fit.reading2 = lm(formula.reading2, data = GB.df)
summary(fit.reading2)
anova(fit.reading2)
print(paste("The AIC of the GLM model is:", AIC(fit.reading2)))
# Visualize the relationship between the reading scores and some interaction terms
# Labels for the 3 immigrant-status x 4 region combinations, in boxplot()'s
# layout order. A dedicated name avoids shadowing base::names(); the data=
# argument replaces the original attach()/detach().
immig_region_labels = c("Nat.Eng", "2ndG.Eng", "1stG.Eng",
                        "Nat.NorIre", "2ndG.NorIre", "1stG.NorIre",
                        "Nat.Wales", "2ndG.Wales", "1stG.Wales",
                        "Nat.Scot", "2ndG.Scot", "1stG.Scot")
boxplot(formula = GB.df[, reading] ~ IMMIG*Region, data = GB.df, xaxt = "n",
        xlab = "", ylab = "Reading Score", cex.main = 1.2,
        main = "Differences in reading score caused by\nimmigrant status and region")
axis(side = 1, at = 1:12, labels = rep("", 12))
# Slanted group labels drawn manually beneath the axis
text(x = 1:12 - 0.6, y = 50, labels = immig_region_labels, xpd = TRUE, srt = 45)
title(xlab = "Immigrant Status & Region", line = 4, cex.lab = 1.2)
school_region_labels = c("Pub.Eng", "Pri.Eng", "Pub.NorIre", "Pri.NorIre",
                         "Pub.Wales", "Pri.Wales", "Pub.Scot", "Pri.Scot")
boxplot(formula = GB.df[, reading] ~ School.Type*Region, data = GB.df, xaxt = "n",
        xlab = "", ylab = "Reading Score", cex.main = 1.2,
        main = "Differences in reading score caused by\nschool type and region")
axis(side = 1, at = 1:8, labels = rep("", 8))
text(x = 1:8 - 0.4, y = 50, labels = school_region_labels, xpd = TRUE, srt = 45)
title(xlab = "School Type & Region", line = 4, cex.lab = 1.2)
# Scatter of the reading score against the ESCS x learning-time product
plot(GB.df$ESCS*GB.df$TMINS, GB.df[, reading], col = "blue",
     xlab = "ESCS * Learning Time", ylab = "Reading Score", cex.main = 1.2,
     main = "The trend of reading score with ESCS multiplying learning time")
# Model selection for Reading subject: Step three
# Re-introduce ISCED and add the IMMIG x Region interaction suggested by
# the box plots above.
formula.reading3 = "get(reading) ~ ISCED + ESCS + Gender + IMMIG +
TMINS + Region + I(exp(-3*Deprived)) + Year + School.Type +
I(sqrt(Total.Student)) + Area.Type + IMMIG:Region"
fit.reading3 = lm(formula.reading3, data = GB.df)
summary(fit.reading3)
anova(fit.reading3)
print(paste("The AIC of the GLM model is:", AIC(fit.reading3)))
# Model selection for Reading subject: Step four
# Add the School.Type x Region interaction.
formula.reading4 = "get(reading) ~ ISCED + ESCS + Gender + IMMIG +
TMINS + Region + I(exp(-3*Deprived)) + Year + School.Type +
I(sqrt(Total.Student)) + Area.Type + IMMIG:Region + School.Type:Region"
fit.reading4 = lm(formula.reading4, data = GB.df)
summary(fit.reading4)
anova(fit.reading4)
print(paste("The AIC of the GLM model is:", AIC(fit.reading4)))
# Model selection for Reading subject: Step five
# Add the ESCS x TMINS interaction; this is the final reading model.
formula.reading5 = "get(reading) ~ ISCED + ESCS + Gender + IMMIG +
TMINS + Region + I(exp(-3*Deprived)) + Year + School.Type +
I(sqrt(Total.Student)) + Area.Type +
IMMIG:Region + School.Type:Region + ESCS:TMINS"
fit.reading5 = lm(formula.reading5, data = GB.df)
summary(fit.reading5)
anova(fit.reading5)
print(paste("The AIC of the GLM model is:", AIC(fit.reading5)))
# Check the model assumptions: studentized residuals by index, then the
# first three lm() diagnostic plots.
par(mfrow = c(2, 2))
plot(rstudent(fit.reading5), ylab = "Standardized Residuals",
cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2,
main = "Standardized residuals vs Index", col = "green4")
plot(fit.reading5, col = "green4", sub = "", cex.main = 1.2,
cex.lab = 1.15, cex.axis = 1.2, lwd = 2, which = 1:3)
graphics.off()
Model selection process for Science subject
# Model selection for Science subject: Step one
# Full model: all candidate covariates (Total.Student untransformed, as in
# the math model).
formula.science1 = "get(science) ~ ISCED + ESCS + Gender + IMMIG +
TMINS + Region + I(exp(-3*Deprived)) + Year + School.Type +
Total.Student + Area.Type + Girl.Ratio"
fit.science1 = lm(formula.science1, data = GB.df)
summary(fit.science1)
anova(fit.science1)
# NOTE(review): the fit uses lm() although the printed label says "GLM".
print(paste("The AIC of the GLM model is:", AIC(fit.science1)))
# Model selection for Science subject: Step two
# Girl.Ratio dropped; ISCED moved to the end, presumably so the
# sequential (Type I) anova table tests it after the other terms.
formula.science2 = "get(science) ~ ESCS + Gender + IMMIG +
TMINS + Region + I(exp(-3*Deprived)) + Year + School.Type +
Total.Student + Area.Type + ISCED"
fit.science2 = lm(formula.science2, data = GB.df)
summary(fit.science2)
anova(fit.science2)
print(paste("The AIC of the GLM model is:", AIC(fit.science2)))
# Visualize the relationship between the science scores and some interaction terms
# Labels for the 3 immigrant-status x 4 region combinations, in boxplot()'s
# layout order. A dedicated name avoids shadowing base::names(); the data=
# argument replaces the original attach()/detach().
immig_region_labels = c("Nat.Eng", "2ndG.Eng", "1stG.Eng",
                        "Nat.NorIre", "2ndG.NorIre", "1stG.NorIre",
                        "Nat.Wales", "2ndG.Wales", "1stG.Wales",
                        "Nat.Scot", "2ndG.Scot", "1stG.Scot")
boxplot(formula = GB.df[, science] ~ IMMIG*Region, data = GB.df, xaxt = "n",
        xlab = "", ylab = "Science Score", cex.main = 1.2,
        main = "Differences in science score caused by\nimmigrant status and region")
axis(side = 1, at = 1:12, labels = rep("", 12))
# Slanted group labels drawn manually beneath the axis
text(x = 1:12 - 0.6, y = 65, labels = immig_region_labels, xpd = TRUE, srt = 45)
title(xlab = "Immigrant Status & Region", line = 4, cex.lab = 1.2)
school_region_labels = c("Pub.Eng", "Pri.Eng", "Pub.NorIre", "Pri.NorIre",
                         "Pub.Wales", "Pri.Wales", "Pub.Scot", "Pri.Scot")
boxplot(formula = GB.df[, science] ~ School.Type*Region, data = GB.df, xaxt = "n",
        xlab = "", ylab = "Science Score", cex.main = 1.2,
        main = "Differences in science score caused by\nschool type and region")
axis(side = 1, at = 1:8, labels = rep("", 8))
text(x = 1:8 - 0.4, y = 65, labels = school_region_labels, xpd = TRUE, srt = 45)
title(xlab = "School Type & Region", line = 4, cex.lab = 1.2)
# Scatter of the science score against the ESCS x learning-time product
plot(GB.df$ESCS*GB.df$TMINS, GB.df[, science], col = "blue",
     xlab = "ESCS * Learning Time", ylab = "Science Score", cex.main = 1.2,
     main = "The trend of science score with ESCS multiplying learning time")
# Model selection for Science subject: Step three
# Add the IMMIG:Region interaction. CONSISTENCY FIX: the original wrote
# IMMIG*Region / School.Type*Region in this section; because the main
# effects already appear in the formula, `*` and `:` yield the same fitted
# model, but `:` is used here to match the math and reading code.
formula.science3 = "get(science) ~ ISCED + ESCS + Gender + IMMIG +
TMINS + Region + I(exp(-3*Deprived)) + Year + School.Type +
Total.Student + Area.Type + IMMIG:Region"
fit.science3 = lm(formula.science3, data = GB.df)
summary(fit.science3)
anova(fit.science3)
print(paste("The AIC of the GLM model is:", AIC(fit.science3)))
# Model selection for Science subject: Step four
# Add the School.Type:Region interaction.
formula.science4 = "get(science) ~ ISCED + ESCS + Gender + IMMIG +
TMINS + Region + I(exp(-3*Deprived)) + Year + School.Type +
Total.Student + Area.Type + IMMIG:Region + School.Type:Region"
fit.science4 = lm(formula.science4, data = GB.df)
summary(fit.science4)
anova(fit.science4)
print(paste("The AIC of the GLM model is:", AIC(fit.science4)))
# Model selection for Science subject: Step five
# Add the ESCS:TMINS interaction; this is the final science model.
formula.science5 = "get(science) ~ ISCED + ESCS + Gender + IMMIG +
TMINS + Region + I(exp(-3*Deprived)) + Year + School.Type +
Total.Student + Area.Type +
IMMIG:Region + School.Type:Region + ESCS:TMINS"
fit.science5 = lm(formula.science5, data = GB.df)
summary(fit.science5)
anova(fit.science5)
print(paste("The AIC of the GLM model is:", AIC(fit.science5)))
# Check the model assumptions: studentized residuals by index, then the
# first three lm() diagnostic plots.
par(mfrow = c(2, 2))
plot(rstudent(fit.science5), ylab = "Standardized Residuals",
cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2,
main = "Standardized residuals vs Index", col = "green4")
plot(fit.science5, col = "green4", sub = "", cex.main = 1.2,
cex.lab = 1.15, cex.axis = 1.2, lwd = 2, which = 1:3)
graphics.off()
Explore the nature of the influence of the associated factors on PISA results one after another
# Explore the effects of ESCS and learning time (TMINS)
print("1. The effects of ESCS and learning time (TMINS):")
# The effect of ESCS
print("(1) The effect of ESCS:")
summary(lm(get(math) ~ ESCS, data = GB.df))
summary(lm(get(reading) ~ ESCS, data = GB.df))
summary(lm(get(science) ~ ESCS, data = GB.df))
print(paste("2.5% quantile of ESCS:", quantile(GB.df$ESCS, 0.025)))
print(paste("97.5% quantile of ESCS:", quantile(GB.df$ESCS, 0.975)))
# BUG FIX / generalization: the original hard-coded PV2MATH/PV2READ/PV2SCIE
# in the plots below, which is only correct when the random draw `a`
# happens to equal 2. Use the score column names chosen earlier (`math`,
# `reading`, `science`) so this section is correct for any value of `a`.
score_cols = c(math, reading, science)
Subject = c("Math", "Reading", "Science")
names(Subject) = score_cols
GB.df %>% select(all_of(score_cols), ESCS) %>%
pivot_longer(all_of(score_cols), names_to = "Subject", values_to = "Score") %>%
ggplot(aes(x = ESCS, y = Score)) + geom_point(color = "deepskyblue3") +
geom_smooth(method = "lm", fullrange = TRUE, color = "red") +
facet_wrap(~Subject, ncol = 3, labeller = labeller(Subject = Subject)) +
labs(y = "PISA Score", title = "Trend of PISA scores with ESCS") +
theme(axis.title = element_text(size = 12), axis.text = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 15),
strip.text = element_text(size = 12, face = "bold"))
ggsave("P3F1.png", width = 8, height = 4)
# The effect of learning time (TMINS)
print("(2) The effect of learning time (TMINS):")
summary(lm(get(math) ~ TMINS, data = GB.df))
summary(lm(get(reading) ~ TMINS, data = GB.df))
summary(lm(get(science) ~ TMINS, data = GB.df))
# Report the SD of the raw TMINS so the scaled coefficients can be read
# back in original units.
print(paste("Standard deviation of the variable \"TMINS\"",
"in the original data set:", sd(GB$TMINS, na.rm = TRUE)))
GB.df %>% select(all_of(score_cols), TMINS) %>%
pivot_longer(all_of(score_cols), names_to = "Subject", values_to = "Score") %>%
ggplot(aes(x = TMINS, y = Score)) + geom_point(color = "deepskyblue3") +
geom_smooth(method = "lm", fullrange = TRUE, color = "red") +
facet_wrap(~Subject, ncol = 3, labeller = labeller(Subject = Subject)) +
labs(x = "Learning Time (Scaled)", y = "PISA Score",
title = "Trend of PISA scores with learning time") +
theme(axis.title = element_text(size = 12), axis.text = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 15),
strip.text = element_text(size = 12, face = "bold"))
ggsave("P3F2.png", width = 8, height = 4)
# Exploration on the interaction of ESCS and learning time (TMINS)
print("(3) The intersection of ESCS and learning time (TMINS):")
fit.ESCSTime.math = lm(get(math) ~ ESCS*TMINS, data = GB.df)
fit.ESCSTime.read = lm(get(reading) ~ ESCS*TMINS, data = GB.df)
fit.ESCSTime.scie = lm(get(science) ~ ESCS*TMINS, data = GB.df)
summary(fit.ESCSTime.math); summary(fit.ESCSTime.read); summary(fit.ESCSTime.scie)
# Evaluation grid for effect(): ESCS at its 0/25/50/75/100% quantiles
# (5 values) and scaled learning time at 0, 2, ..., 10 (6 values) --
# 5 x 6 = 30 rows per subject, matching the rep(..., 30) below.
ESCS.level = quantile(GB.df$ESCS, probs = c(0, 0.25, 0.5, 0.75, 1))
TMINS.level = seq(0, 10, 2)
Effect.ESCSTime = rbind(
as.data.frame(effect("ESCS*TMINS", fit.ESCSTime.math,
xlevels = list(ESCS = ESCS.level, TMINS = TMINS.level), typical = mean)),
as.data.frame(effect("ESCS*TMINS", fit.ESCSTime.read,
xlevels = list(ESCS = ESCS.level, TMINS = TMINS.level), typical = mean)),
as.data.frame(effect("ESCS*TMINS", fit.ESCSTime.scie,
xlevels = list(ESCS = ESCS.level, TMINS = TMINS.level), typical = mean))) %>%
mutate(Subject = c(rep("Math", 30), rep("Reading", 30), rep("Science", 30)),
ESCS = factor(ESCS), TMINS = factor(TMINS))
# One line per ESCS level; gray ribbons show the pointwise confidence bands.
Effect.ESCSTime %>% ggplot(aes(x = TMINS, y = fit)) +
geom_line(aes(color = ESCS, group = ESCS)) +
facet_wrap(~Subject, ncol = 3) +
geom_ribbon(aes(ymin = lower, ymax = upper, group = ESCS),
color = "gray", fill = "gray", alpha = 0.3) +
labs(x = "Learning Time (Scaled)", y = "PISA Score",
title = paste("Effect of learning time on PISA scores of\n",
"students with different ESCS levels", sep = "")) +
scale_color_discrete(name = "ESCS Level",
labels = paste(as.character(seq(0, 100, 25)), "% quantile", sep = "")) +
guides(color = guide_legend(reverse = TRUE)) +
theme(axis.title = element_text(size = 12), axis.text = element_text(size = 12),
strip.text = element_text(size = 12, face = "bold"),
plot.title = element_text(hjust = 0.5, size = 14),
legend.title = element_text(size = 14), legend.text = element_text(size = 12))
ggsave("P3F3.png", width = 8, height = 5)
# Explore the effect of parental education level (ISCED)
print("2. The effect of parental education level (ISCED):")
fit.ISCED.math = lm(get(math) ~ ISCED, data = GB.df)
fit.ISCED.read = lm(get(reading) ~ ISCED, data = GB.df)
fit.ISCED.scie = lm(get(science) ~ ISCED, data = GB.df)
summary(fit.ISCED.math); summary(fit.ISCED.read); summary(fit.ISCED.scie)
# ISCED has 7 levels here (labelled ISCED0-ISCED6 below), hence rep(..., 7)
Effect.ISCED = rbind(
as.data.frame(effect("ISCED", fit.ISCED.math)),
as.data.frame(effect("ISCED", fit.ISCED.read)),
as.data.frame(effect("ISCED", fit.ISCED.scie))) %>%
mutate(Subject = c(rep("Math", 7), rep("Reading", 7), rep("Science", 7)))
# Tile height spans the confidence interval; the text label is the fitted mean
Effect.ISCED %>% ggplot(aes(x = ISCED, y = fit)) +
geom_tile(aes(height = (upper - lower)), fill = "deeppink", alpha = 0.8) +
geom_text(aes(label = round(fit, 0)), size = 4) +
facet_wrap(~Subject, ncol = 3) +
labs(x = "ISCED Level", y = "PISA Score",
title = "Trend of PISA scores with parental education level") +
scale_x_discrete(labels = paste("ISCED", as.character(0:6), sep = "")) +
theme(axis.title = element_text(size = 12), axis.text = element_text(size = 12),
axis.text.x = element_text(angle = 45, hjust = 1),
strip.text = element_text(size = 12, face = "bold"),
plot.title = element_text(hjust = 0.5, size = 14))
ggsave("P3F4.png", width = 8, height = 5)
# Explore the effect of gender
print("3. The effect of gender:")
summary(lm(get(math) ~ Gender, data = GB.df))
summary(lm(get(reading) ~ Gender, data = GB.df))
summary(lm(get(science) ~ Gender, data = GB.df))
# Explore the effects of immigrant status (IMMIG) and school type
print("4. The effects of immigrant status (IMMIG) and school type:")
# The effect of immigrant status (IMMIG)
print("(1) The effect of immigrant status (IMMIG):")
summary(lm(get(math) ~ IMMIG, data = GB.df))
summary(lm(get(reading) ~ IMMIG, data = GB.df))
summary(lm(get(science) ~ IMMIG, data = GB.df))
# The effect of school type
print("(2) The effect of school type:")
summary(lm(get(math) ~ School.Type, data = GB.df))
summary(lm(get(reading) ~ School.Type, data = GB.df))
summary(lm(get(science) ~ School.Type, data = GB.df))
# Exploration on the intersection of immigrant status (IMMIG) and region
print("(3) The intersection of immigrant status (IMMIG) and region:")
fit.IMMIGReg.math = lm(get(math) ~ IMMIG*Region, data = GB.df)
fit.IMMIGReg.read = lm(get(reading) ~ IMMIG*Region, data = GB.df)
fit.IMMIGReg.scie = lm(get(science) ~ IMMIG*Region, data = GB.df)
# 3 IMMIG levels x 4 regions = 12 effect rows per subject
Effect.IMMIGReg = rbind(
as.data.frame(effect("IMMIG*Region", fit.IMMIGReg.math)),
as.data.frame(effect("IMMIG*Region", fit.IMMIGReg.read)),
as.data.frame(effect("IMMIG*Region", fit.IMMIGReg.scie))) %>%
mutate(Subject = c(rep("Math", 12), rep("Reading", 12), rep("Science", 12)),
Region = factor(Region, levels = c("England", "NorthIre", "Wales", "Scotland")))
Effect.IMMIGReg %>% ggplot(aes(x = Region, y = fit)) +
geom_line(aes(color = IMMIG, group = IMMIG)) +
facet_wrap(~Subject, ncol = 3) +
geom_ribbon(aes(ymin = lower, ymax = upper, group = IMMIG),
color = "gray", fill = "gray", alpha = 0.3) +
labs(x = "Region", y = "PISA Score",
title = "Trend of PISA scores with immigrant status and region") +
scale_color_discrete(name = "Immigrant status",
labels = c("1st Generation", "2nd Generation", "Native")) +
theme(axis.title = element_text(size = 12), axis.text = element_text(size = 12),
axis.text.x = element_text(angle = 45, hjust = 1),
strip.text = element_text(size = 12, face = "bold"),
plot.title = element_text(hjust = 0.5, size = 14),
legend.title = element_text(size = 14), legend.text = element_text(size = 12))
ggsave("P3F5.png", width = 8, height = 5)
# Exploration on the intersection of school type and region
print("(4) The intersection of school type and region:")
fit.STReg.math = lm(get(math) ~ School.Type*Region, data = GB.df)
fit.STReg.read = lm(get(reading) ~ School.Type*Region, data = GB.df)
fit.STReg.scie = lm(get(science) ~ School.Type*Region, data = GB.df)
# 2 school types x 4 regions = 8 effect rows per subject
Effect.STReg = rbind(
as.data.frame(effect("School.Type*Region", fit.STReg.math)),
as.data.frame(effect("School.Type*Region", fit.STReg.read)),
as.data.frame(effect("School.Type*Region", fit.STReg.scie))) %>%
mutate(Subject = c(rep("Math", 8), rep("Reading", 8), rep("Science", 8)),
Region = factor(Region, levels = c("England", "NorthIre", "Wales", "Scotland")))
Effect.STReg %>% ggplot(aes(x = Region, y = fit)) +
geom_line(aes(color = School.Type, group = School.Type)) +
facet_wrap(~Subject, ncol = 3) +
geom_ribbon(aes(ymin = lower, ymax = upper, group = School.Type),
color = "gray", fill = "gray", alpha = 0.3) +
labs(x = "Region", y = "PISA Score",
title = "Trend of PISA scores with school type and region") +
scale_color_discrete(name = "School Type") +
theme(axis.title = element_text(size = 12), axis.text = element_text(size = 12),
axis.text.x = element_text(angle = 45, hjust = 1),
strip.text = element_text(size = 12, face = "bold"),
plot.title = element_text(hjust = 0.5, size = 14),
legend.title = element_text(size = 12), legend.text = element_text(size = 12))
ggsave("P3F6.png", width = 8, height = 5)
# Explore the effect of the proportion of deprived students in the school
print("5. The effect of the proportion of deprived students in the school:")
# The regressor is the transform exp(-3*Deprived), modelling a nonlinear
# (exponentially decaying) relationship with the deprivation proportion
summary(lm(get(math) ~ I(exp(-3*Deprived)), data = GB.df))
summary(lm(get(reading) ~ I(exp(-3*Deprived)), data = GB.df))
summary(lm(get(science) ~ I(exp(-3*Deprived)), data = GB.df))
# Explore the effect of the scale of schools and communities
print("6. The effect of the scale of schools and communities:")
# The effect of the scale of schools, the total number of students in the school
print(paste("(1) The effect of the scale of schools",
"(the total number of students in the school):"))
summary(lm(get(math) ~ Total.Student, data = GB.df))
summary(lm(get(reading) ~ Total.Student, data = GB.df))
summary(lm(get(science) ~ Total.Student, data = GB.df))
# Total.Student was built from SC002Q01TA + SC002Q02TA (boys + girls), so the
# SD of that sum in the raw data puts the scaled coefficient in context
print(paste("Standard deviation of the variable \"Total.Student\":",
sd(GB$SC002Q01TA + GB$SC002Q02TA, na.rm = TRUE)))
# The effect of the scale of the area where the school locates
print("(2) The effect of the scale of the area where the school locates:")
fit.area.math = lm(get(math) ~ Area.Type, data = GB.df)
fit.area.read = lm(get(reading) ~ Area.Type, data = GB.df)
fit.area.scie = lm(get(science) ~ Area.Type, data = GB.df)
summary(fit.area.math); summary(fit.area.read); summary(fit.area.scie)
# Area.Type has 5 levels, ordered from smallest to largest community
Effect.Area = rbind(
as.data.frame(effect("Area.Type", fit.area.math)),
as.data.frame(effect("Area.Type", fit.area.read)),
as.data.frame(effect("Area.Type", fit.area.scie))) %>%
mutate(Subject = c(rep("Math", 5), rep("Reading", 5), rep("Science", 5)),
Area.Type = factor(Area.Type,
levels = c("Village", "Small.Town", "Town", "City", "Large.City"),
labels = c("Village", "Small Town", "Town", "City", "Large City")))
# Tile height spans the confidence interval; the text label is the fitted mean
Effect.Area %>% ggplot(aes(x = Area.Type, y = fit)) +
geom_tile(aes(height = (upper - lower)), fill = "deeppink", alpha = 0.8) +
geom_text(aes(label = round(fit, 1)), size = 4) +
facet_wrap(~Subject, ncol = 3) +
labs(x = "Area Type", y = "PISA Score",
title = "Trend of PISA scores with the scale of communities") +
theme(axis.title = element_text(size = 12), axis.text = element_text(size = 12),
axis.text.x = element_text(angle = 45, hjust = 1),
strip.text = element_text(size = 12, face = "bold"),
plot.title = element_text(hjust = 0.5, size = 14))
ggsave("P3F7.png", width = 8, height = 5)
In this section, I compare the PISA results across the four UK nations (England, Northern Ireland, Wales and Scotland) and analyze how the PISA results changed from 2015 to 2018.
Create boxplots to visualize the relationship between PISA scores and the intersection of the variable Region and Year
# Prepare the labels for each class; the order matches the levels of the
# Region:Year interaction factor used on the x axis below
Labels = c("England15", "England18", "NorthIre15", "NorthIre18",
"Wales15", "Wales18", "Scotland15", "Scotland18")
# Create boxplots to visualize the relationship between PISA scores and
# the intersection of the variable `Region` and `Year`
GB.df %>% pivot_longer(
cols = c(PV2MATH, PV2READ, PV2SCIE), names_to = "Subject", values_to = "Score") %>%
ggplot(aes(x = Region:Year, y = Score)) + geom_boxplot(aes(fill = Year)) +
facet_wrap(~Subject, ncol = 3, labeller = labeller(Subject =
c("PV2MATH" = "Math", "PV2READ" = "Reading", "PV2SCIE" = "Science"))) +
labs(x = "Region & Year", y = "PISA Score",
title = "PISA scores in different regions and years") +
scale_x_discrete(labels = Labels) +
scale_y_continuous(breaks = seq(200, 800, 200)) +
theme(axis.title = element_text(size = 12), axis.text = element_text(size = 11),
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5, size = 14),
legend.title = element_text(size = 12), legend.text = element_text(size = 12),
legend.position = "top", strip.text = element_text(size = 12, face = "bold"))
ggsave("P4F1.png", width = 8, height = 5)
1. Statistical inference based on normal samples
(1) Build confidence intervals for the PISA score of each subject in each group
(2) Compare the PISA score of each subject between the four nations of the UK and analyze the changes of the score from 2015 to 2018
# Construct the function to perform the two-sample T-test for normal data
# Perform a pooled two-sample t-test for (assumed) normal data.
# Decision logic (kept from the original analysis): the statistic is first
# compared with the one-sided critical values at level `alpha`; on rejection a
# one-sided p-value and conclusion are reported, otherwise the two-sided
# p-value is reported with the conclusion "Similar".
# Input:  sample1 (numeric): The first sample of normal data
#         sample2 (numeric): The second sample of normal data
#         alpha (numeric, between 0 and 1): The significance level of the test
# Output: A one-row data frame with columns T.TestStat, Df, alpha, Type,
#         p.value and Result ("Greater than", "Less than" or "Similar")
T.TwoSample = function(sample1, sample2, alpha = 0.05){
  # Sample sizes, means and standard deviations
  n1 = length(sample1); n2 = length(sample2)
  mean1 = mean(sample1); mean2 = mean(sample2)
  S1 = sd(sample1); S2 = sd(sample2)
  # Pooled standard deviation (equal-variance assumption)
  Sp = sqrt(((n1 - 1)*S1^2 + (n2 - 1)*S2^2)/(n1 + n2 - 2))
  # `t.stat` replaces the original local `T`, which shadowed the logical
  # constant TRUE (a well-known R footgun)
  t.stat = (mean1 - mean2)/Sp/sqrt(1/n1 + 1/n2)
  Df = n1 + n2 - 2
  # Compare the test statistic with the critical values
  if(t.stat > qt(1 - alpha, df = Df)){
    Type = "One.sided"
    p = 1 - pt(t.stat, df = Df)
    Conclusion = "Greater than"
  } else if(t.stat < qt(alpha, df = Df)){
    Type = "One.sided"
    p = pt(t.stat, df = Df)
    Conclusion = "Less than"
  } else {
    Type = "Two.sided"
    p = 2*(1 - pt(abs(t.stat), df = Df))
    Conclusion = "Similar"
  }
  # Organize the results into a one-row data frame (column names unchanged,
  # so existing callers that print the result are unaffected)
  data.frame(
    T.TestStat = t.stat, Df = Df, alpha = alpha, Type = Type,
    p.value = p, Result = Conclusion
  )
}
# Construct the function to build the confidence interval for the mean of a normal sample
# Build a two-sided (1 - alpha) confidence interval for the mean of a normal
# sample, using the t distribution with n - 1 degrees of freedom.
# Input:  sample (numeric): The sample of normal data whose mean we estimate
#         alpha (numeric, between 0 and 1): The significance level, so the
#           interval has confidence level 1 - alpha
# Output: A one-row data frame with columns "lower", "mean" and "upper"
T.OneSample = function(sample, alpha = 0.05){
  n = length(sample)
  # `xbar` replaces the original local `mean`, which shadowed base::mean()
  xbar = mean(sample)
  S = sd(sample)
  # Half-width of the interval: t critical value times the standard error
  margin = S*qt(1 - alpha/2, df = n - 1)/sqrt(n)
  # Column names are unchanged so downstream cbind()/ggplot code still works
  data.frame("lower" = xbar - margin, "mean" = xbar, "upper" = xbar + margin)
}
# Analyze the changes of the PISA score of each subject in the UK from 2015 to 2018
# and build confidence intervals for the score of each group
# Prepare the data; `[, 1]` extracts the single selected column as a plain vector
Math15 = (GB.df %>% filter(Year == 2015) %>% select(PV2MATH))[, 1]
Math18 = (GB.df %>% filter(Year == 2018) %>% select(PV2MATH))[, 1]
Reading15 = (GB.df %>% filter(Year == 2015) %>% select(PV2READ))[, 1]
Reading18 = (GB.df %>% filter(Year == 2018) %>% select(PV2READ))[, 1]
Science15 = (GB.df %>% filter(Year == 2015) %>% select(PV2SCIE))[, 1]
Science18 = (GB.df %>% filter(Year == 2018) %>% select(PV2SCIE))[, 1]
# Build confidence intervals for the score of each group; the row order of the
# rbind() must match the Subject/Year labels defined in the data.frame() above
Result.Year = cbind(data.frame(
Subject = c(rep("Math", 2), rep("Reading", 2), rep("Science", 2)),
Year = factor(rep(c(2015, 2018), 3))),
rbind(T.OneSample(Math15), T.OneSample(Math18), T.OneSample(Reading15),
T.OneSample(Reading18), T.OneSample(Science15), T.OneSample(Science18)))
# Tile height spans the CI; the text label is the sample mean
Result.Year %>% ggplot(aes(x = Year, y = mean)) +
geom_tile(aes(height = (upper - lower)),
fill = "darkturquoise", alpha = 0.8) +
geom_text(aes(label = round(mean, 2)), size = 6) +
facet_wrap(~Subject, ncol = 3) +
labs(y = "PISA Score",
title = "PISA scores of the three subjects in 2015 and 2018") +
theme(axis.title = element_text(size = 12), axis.text = element_text(size = 11),
strip.text = element_text(size = 12, face = "bold"),
plot.title = element_text(hjust = 0.5, size = 14))
ggsave("P4F2.png", width = 8, height = 5)
# Analyze the changes of the overall PISA score of each subject
# in the UK from 2015 to 2018
print("Comparison between the year 2015 and 2018:")
print("The math subject:")
T.TwoSample(Math15, Math18)
print("The reading subject:")
T.TwoSample(Reading15, Reading18)
print("The science subject:")
T.TwoSample(Science15, Science18)
# Compare the PISA score of each subject between the four nations of the UK
# and build confidence intervals for the score of each group
# Prepare the data (each object is a plain numeric vector of scores)
Eng.math = (GB.df %>% filter(Region == "England") %>% select(PV2MATH))[, 1]
NorIre.math = (GB.df %>% filter(Region == "NorthIre") %>% select(PV2MATH))[, 1]
Wales.math = (GB.df %>% filter(Region == "Wales") %>% select(PV2MATH))[, 1]
Scot.math = (GB.df %>% filter(Region == "Scotland") %>% select(PV2MATH))[, 1]
Eng.reading = (GB.df %>% filter(Region == "England") %>% select(PV2READ))[, 1]
NorIre.reading = (GB.df %>% filter(Region == "NorthIre") %>% select(PV2READ))[, 1]
Wales.reading = (GB.df %>% filter(Region == "Wales") %>% select(PV2READ))[, 1]
Scot.reading = (GB.df %>% filter(Region == "Scotland") %>% select(PV2READ))[, 1]
Eng.science = (GB.df %>% filter(Region == "England") %>% select(PV2SCIE))[, 1]
NorIre.science = (GB.df %>% filter(Region == "NorthIre") %>% select(PV2SCIE))[, 1]
Wales.science = (GB.df %>% filter(Region == "Wales") %>% select(PV2SCIE))[, 1]
Scot.science = (GB.df %>% filter(Region == "Scotland") %>% select(PV2SCIE))[, 1]
# Build confidence intervals for the score of each group
# Note: unlike Result.Year above, here the rbind(...) block is passed as an
# unnamed argument to data.frame(), which splices its columns in; the result
# is equivalent and the outer cbind() is then a no-op wrapper
Result.Region = cbind(data.frame(
Subject = c(rep("Math", 4), rep("Reading", 4), rep("Science", 4)),
Region = factor(rep(c("England", "NorthIre", "Wales", "Scotland"), 3),
levels = c("England", "NorthIre", "Wales", "Scotland")),
rbind(T.OneSample(Eng.math), T.OneSample(NorIre.math),
T.OneSample(Wales.math), T.OneSample(Scot.math),
T.OneSample(Eng.reading), T.OneSample(NorIre.reading),
T.OneSample(Wales.reading), T.OneSample(Scot.reading),
T.OneSample(Eng.science), T.OneSample(NorIre.science),
T.OneSample(Wales.science), T.OneSample(Scot.science))))
Result.Region %>% ggplot(aes(x = Region, y = mean)) +
geom_tile(aes(height = (upper - lower)),
fill = "darkturquoise", alpha = 0.8) +
geom_text(aes(label = round(mean, 2)), size = 4) +
facet_wrap(~Subject, ncol = 3) +
labs(y = "PISA Score",
title = "PISA scores of the three subjects in different UK regions") +
theme(axis.title = element_text(size = 12), axis.text = element_text(size = 11),
axis.text.x = element_text(angle = 45, hjust = 1),
strip.text = element_text(size = 12, face = "bold"),
plot.title = element_text(hjust = 0.5, size = 14))
ggsave("P4F3.png", width = 8, height = 5)
# Compare the PISA score of each subject between the four nations of the UK
# (pairwise comparisons in a chain: Eng-NI, NI-Scot, Scot-Wales)
print("Comparison among the four nations of the UK:")
print("The math subject:")
T.TwoSample(Eng.math, NorIre.math)
T.TwoSample(NorIre.math, Scot.math)
T.TwoSample(Scot.math, Wales.math)
print("The reading subject:")
T.TwoSample(Eng.reading, NorIre.reading)
T.TwoSample(NorIre.reading, Scot.reading)
T.TwoSample(Scot.reading, Wales.reading)
print("The science subject:")
T.TwoSample(Eng.science, NorIre.science)
T.TwoSample(NorIre.science, Scot.science)
T.TwoSample(Scot.science, Wales.science)
# Compare the PISA score of each subject between the four nations of the UK and
# analyze the changes of the score from 2015 to 2018, and build confidence intervals
# for the score of each group
# Prepare the data: one score vector per (year, region, subject) cell,
# 8 cells per subject (4 regions x 2 years)
Eng15.math = (GB.df %>% filter(Year == 2015, Region == "England") %>% select(PV2MATH))[, 1]
Eng18.math = (GB.df %>% filter(Year == 2018, Region == "England") %>% select(PV2MATH))[, 1]
NorIre15.math = (GB.df %>% filter(Year == 2015, Region == "NorthIre") %>% select(PV2MATH))[, 1]
NorIre18.math = (GB.df %>% filter(Year == 2018, Region == "NorthIre") %>% select(PV2MATH))[, 1]
Wales15.math = (GB.df %>% filter(Year == 2015, Region == "Wales") %>% select(PV2MATH))[, 1]
Wales18.math = (GB.df %>% filter(Year == 2018, Region == "Wales") %>% select(PV2MATH))[, 1]
Scot15.math = (GB.df %>% filter(Year == 2015, Region == "Scotland") %>% select(PV2MATH))[, 1]
Scot18.math = (GB.df %>% filter(Year == 2018, Region == "Scotland") %>% select(PV2MATH))[, 1]
Eng15.read = (GB.df %>% filter(Year == 2015, Region == "England") %>% select(PV2READ))[, 1]
Eng18.read = (GB.df %>% filter(Year == 2018, Region == "England") %>% select(PV2READ))[, 1]
NorIre15.read = (GB.df %>% filter(Year == 2015, Region == "NorthIre") %>% select(PV2READ))[, 1]
NorIre18.read = (GB.df %>% filter(Year == 2018, Region == "NorthIre") %>% select(PV2READ))[, 1]
Wales15.read = (GB.df %>% filter(Year == 2015, Region == "Wales") %>% select(PV2READ))[, 1]
Wales18.read = (GB.df %>% filter(Year == 2018, Region == "Wales") %>% select(PV2READ))[, 1]
Scot15.read = (GB.df %>% filter(Year == 2015, Region == "Scotland") %>% select(PV2READ))[, 1]
Scot18.read = (GB.df %>% filter(Year == 2018, Region == "Scotland") %>% select(PV2READ))[, 1]
Eng15.scie = (GB.df %>% filter(Year == 2015, Region == "England") %>% select(PV2SCIE))[, 1]
Eng18.scie = (GB.df %>% filter(Year == 2018, Region == "England") %>% select(PV2SCIE))[, 1]
NorIre15.scie = (GB.df %>% filter(Year == 2015, Region == "NorthIre") %>% select(PV2SCIE))[, 1]
NorIre18.scie = (GB.df %>% filter(Year == 2018, Region == "NorthIre") %>% select(PV2SCIE))[, 1]
Wales15.scie = (GB.df %>% filter(Year == 2015, Region == "Wales") %>% select(PV2SCIE))[, 1]
Wales18.scie = (GB.df %>% filter(Year == 2018, Region == "Wales") %>% select(PV2SCIE))[, 1]
Scot15.scie = (GB.df %>% filter(Year == 2015, Region == "Scotland") %>% select(PV2SCIE))[, 1]
Scot18.scie = (GB.df %>% filter(Year == 2018, Region == "Scotland") %>% select(PV2SCIE))[, 1]
# Build confidence intervals for the score of each group; the rbind() row
# order must match the Subject/Year/Region label columns defined above it
Res.YearReg = cbind(data.frame(
Subject = c(rep("Math", 8), rep("Reading", 8), rep("Science", 8)),
Year = factor(rep(c(2015, 2018), 12)),
Region = factor(rep(c(rep("England", 2), rep("NorthIre", 2),
rep("Wales", 2), rep("Scotland", 2)), 3),
levels = c("England", "NorthIre", "Wales", "Scotland")),
rbind(T.OneSample(Eng15.math), T.OneSample(Eng18.math),
T.OneSample(NorIre15.math), T.OneSample(NorIre18.math),
T.OneSample(Wales15.math), T.OneSample(Wales18.math),
T.OneSample(Scot15.math), T.OneSample(Scot18.math),
T.OneSample(Eng15.read), T.OneSample(Eng18.read),
T.OneSample(NorIre15.read), T.OneSample(NorIre18.read),
T.OneSample(Wales15.read), T.OneSample(Wales18.read),
T.OneSample(Scot15.read), T.OneSample(Scot18.read),
T.OneSample(Eng15.scie), T.OneSample(Eng18.scie),
T.OneSample(NorIre15.scie), T.OneSample(NorIre18.scie),
T.OneSample(Wales15.scie), T.OneSample(Wales18.scie),
T.OneSample(Scot15.scie), T.OneSample(Scot18.scie))))
# `Labels` (defined before the boxplot section) matches the Region:Year order
Res.YearReg %>% ggplot(aes(x = Region:Year, y = mean)) +
geom_tile(aes(height = (upper - lower)),
fill = "darkturquoise", alpha = 0.8) +
geom_text(aes(label = round(mean, 1)), size = 4, angle = 45) +
facet_wrap(~Subject, ncol = 3) +
labs(x = "Region & Year", y = "PISA Score",
title = "PISA scores of the three subjects in different UK regions and years") +
scale_x_discrete(labels = Labels) +
theme(axis.title = element_text(size = 12), axis.text = element_text(size = 11),
axis.text.x = element_text(angle = 45, hjust = 1),
strip.text = element_text(size = 12, face = "bold"),
plot.title = element_text(hjust = 0.5, size = 14))
ggsave("P4F4.png", width = 8, height = 6)
# Compare the PISA score of each subject between the four nations of the UK and
# analyze the changes of the score from 2015 to 2018
print("Comparison between the year 2015 and 2018 and among the four nations of the UK:")
print("1. The comparison between the year 2015 and 2018:")
print("(1) The math subject:")
T.TwoSample(Eng15.math, Eng18.math)
T.TwoSample(NorIre15.math, NorIre18.math)
T.TwoSample(Wales15.math, Wales18.math)
T.TwoSample(Scot15.math, Scot18.math)
print("(2) The reading subject:")
T.TwoSample(Eng15.read, Eng18.read)
T.TwoSample(NorIre15.read, NorIre18.read)
T.TwoSample(Wales15.read, Wales18.read)
T.TwoSample(Scot15.read, Scot18.read)
print("(3) The science subject:")
T.TwoSample(Eng15.scie, Eng18.scie)
T.TwoSample(NorIre15.scie, NorIre18.scie)
T.TwoSample(Wales15.scie, Wales18.scie)
T.TwoSample(Scot15.scie, Scot18.scie)
print("2. The comparison among the four nations of the UK:")
print("(1) The math subject:")
T.TwoSample(Eng15.math, NorIre15.math)
T.TwoSample(NorIre15.math, Scot15.math)
T.TwoSample(Scot15.math, Wales15.math)
T.TwoSample(Eng18.math, NorIre18.math)
T.TwoSample(NorIre18.math, Scot18.math)
T.TwoSample(Scot18.math, Wales18.math)
print("(2) The reading subject:")
T.TwoSample(Eng15.read, NorIre15.read)
T.TwoSample(NorIre15.read, Scot15.read)
T.TwoSample(Scot15.read, Wales15.read)
T.TwoSample(Eng18.read, NorIre18.read)
T.TwoSample(NorIre18.read, Scot18.read)
# Extra 2018 reading comparison (England vs Scotland) beyond the usual chain
T.TwoSample(Eng18.read, Scot18.read)
T.TwoSample(Scot18.read, Wales18.read)
print("(3) The science subject:")
T.TwoSample(Eng15.scie, NorIre15.scie)
T.TwoSample(NorIre15.scie, Scot15.scie)
T.TwoSample(Scot15.scie, Wales15.scie)
T.TwoSample(Eng18.scie, NorIre18.scie)
T.TwoSample(NorIre18.scie, Scot18.scie)
T.TwoSample(Scot18.scie, Wales18.scie)
# Extra 2018 science comparison (Northern Ireland vs Wales)
T.TwoSample(NorIre18.scie, Wales18.scie)
2. The analyses based on generalized linear models (GLMs)
Use GLMs (here, Gaussian linear models fitted with lm(), a special case of a GLM) to analyze the differences in PISA scores between the UK nations and the changes from 2015 to 2018, i.e. to study the effects of the variables Region and Year in the regression model
# The generalized linear model for the math subject
# (fitted with lm(): a Gaussian linear model with the Year x Region interaction)
fit.YearReg.math = lm(get(math) ~ Year + Region + Year:Region, data = GB.df)
summary(fit.YearReg.math)
# 2x2 panel of residual diagnostics: studentized residuals vs index, then
# the first three standard lm() diagnostic plots
par(mfrow = c(2, 2))
plot(rstudent(fit.YearReg.math), ylab = "Standardized Residuals",
cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2,
main = "Standardized residuals vs Index", col = "green4")
plot(fit.YearReg.math, col = "green4", sub = "", cex.main = 1.2,
cex.lab = 1.15, cex.axis = 1.2, lwd = 2, which = 1:3)
Effect.math = as.data.frame(effect("Year*Region", fit.YearReg.math))
# The generalized linear model for the reading subject
fit.YearReg.reading = lm(get(reading) ~ Year + Region + Year:Region, data = GB.df)
summary(fit.YearReg.reading)
plot(rstudent(fit.YearReg.reading), ylab = "Standardized Residuals",
cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2,
main = "Standardized residuals vs Index", col = "green4")
plot(fit.YearReg.reading, col = "green4", sub = "", cex.main = 1.2,
cex.lab = 1.15, cex.axis = 1.2, lwd = 2, which = 1:3)
Effect.reading = as.data.frame(effect("Year*Region", fit.YearReg.reading))
# The generalized linear model for the science subject
fit.YearReg.science = lm(get(science) ~ Year + Region + Year:Region, data = GB.df)
summary(fit.YearReg.science)
plot(rstudent(fit.YearReg.science), ylab = "Standardized Residuals",
cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2,
main = "Standardized residuals vs Index", col = "green4")
plot(fit.YearReg.science, col = "green4", sub = "", cex.main = 1.2,
cex.lab = 1.15, cex.axis = 1.2, lwd = 2, which = 1:3)
Effect.science = as.data.frame(effect("Year*Region", fit.YearReg.science))
# Visualize the difference of PISA scores between the UK nations and the changes
# from 2015 to 2018, i.e. the effects of the variable `Region` and `Year`
# (2 years x 4 regions = 8 effect rows per subject)
Effects = rbind(Effect.math, Effect.reading, Effect.science) %>%
mutate(Subject = c(rep("Math", 8), rep("Reading", 8), rep("Science", 8)))
Effects %>% ggplot(aes(x = Year, y = fit, color = Region)) +
geom_line(aes(group = Region)) + facet_wrap(~Subject, ncol = 3) +
geom_ribbon(aes(ymin = lower, ymax = upper, group = Region),
color = "gray", fill = "gray", alpha = 0.5) +
labs(y = "PISA Score", title = "Effects of region and year on PISA scores") +
theme(axis.title = element_text(size = 12), axis.text = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 14),
legend.title = element_text(size = 12), legend.position = "top",
legend.text = element_text(size = 12))
ggsave("P4F5.png", width = 8, height = 5)
# Close all open graphics devices before the Bayesian section
graphics.off()
3. The analyses based on Bayesian modelling
(1) Construct Bayesian linear regression models for every subject and sample from the posterior distributions of the model parameters
(2) Compute and visualize the posterior predictive samples (density) of the average PISA score of each subject for each group of data
# Specify priors for regression parameters (coefficients) and the standard
# deviation of the response variable
# `prec.prior`: prior on the Gaussian observation precision ("logtnormal"
# with param = c(100, 0.01)); `prior.beta`: vague normal priors on the
# intercept and fixed effects (precision 0.001)
prec.prior = list(prec = list(prior = "logtnormal", param = c(100, 0.01)))
prior.beta = list(mean.intercept = 0, prec.intercept = 0.001,
mean = 0, prec = 0.001)
# Construct the Bayesian linear regression model for math subject and
# sample from the posterior distributions of the model parameters
# control.compute = list(config = TRUE) is required for inla.posterior.sample()
Math.inla.model = inla(formula = get(math) ~ ISCED + ESCS + Gender +
TMINS + Region + I(exp(-3*Deprived)) + Year + School.Type +
Total.Student + School.Type:Region + IMMIG:Region + ESCS:TMINS,
data = GB.df, family = "gaussian", control.compute = list(config = TRUE),
control.fixed = prior.beta, control.family = list(hyper = prec.prior))
summary(Math.inla.model)
nsamp = 3000; set.seed(1000)
inla.seed = as.integer(runif(1)*.Machine$integer.max)
Math.inla.sample = inla.posterior.sample(
n = nsamp, result = Math.inla.model, seed = inla.seed, num.threads = "1:1")
# Extract the linear predictor for every observation and posterior sample
Math.predictor = data.frame(
inla.posterior.sample.eval(function(...){Predictor}, Math.inla.sample))
# Convert precision to SD: sigma = 1/sqrt(theta)
# (assumes `theta` here is the Gaussian observation precision — TODO confirm
# against the INLA hyperparameter naming for this model)
Math.sigma = 1/sqrt(inla.posterior.sample.eval(function(...){theta}, Math.inla.sample))
# Drop the large sample object and reclaim memory
gc(rm(Math.inla.sample))
# Construct the Bayesian linear regression model for reading subject and
# sample from the posterior distributions of the model parameters
Read.inla.model = inla(formula = get(reading) ~ ISCED + ESCS + Gender +
TMINS + Region + I(exp(-3*Deprived)) + Year + School.Type +
Total.Student + School.Type:Region + IMMIG:Region + ESCS:TMINS,
data = GB.df, family = "gaussian", control.compute = list(config = TRUE),
control.fixed = prior.beta, control.family = list(hyper = prec.prior))
summary(Read.inla.model)
nsamp = 3000; set.seed(1500)
inla.seed = as.integer(runif(1)*.Machine$integer.max)
Read.inla.sample = inla.posterior.sample(
n = nsamp, result = Read.inla.model, seed = inla.seed, num.threads = "1:1")
Read.predictor = data.frame(
inla.posterior.sample.eval(function(...){Predictor}, Read.inla.sample))
Read.sigma = 1/sqrt(inla.posterior.sample.eval(function(...){theta}, Read.inla.sample))
gc(rm(Read.inla.sample))
# Construct the Bayesian linear regression model for science subject and
# sample from the posterior distributions of the model parameters
Scie.inla.model = inla(formula = get(science) ~ ISCED + ESCS + Gender +
TMINS + Region + I(exp(-3*Deprived)) + Year + School.Type +
Total.Student + School.Type:Region + IMMIG:Region + ESCS:TMINS,
data = GB.df, family = "gaussian", control.compute = list(config = TRUE),
control.fixed = prior.beta, control.family = list(hyper = prec.prior))
summary(Scie.inla.model)
nsamp = 3000; set.seed(2000)
inla.seed = as.integer(runif(1)*.Machine$integer.max)
Scie.inla.sample = inla.posterior.sample(
n = nsamp, result = Scie.inla.model, seed = inla.seed, num.threads = "1:1")
Scie.predictor = data.frame(
inla.posterior.sample.eval(function(...){Predictor}, Scie.inla.sample))
Scie.sigma = 1/sqrt(inla.posterior.sample.eval(function(...){theta}, Scie.inla.sample))
gc(rm(Scie.inla.sample))
# Create the data frames that will be used to record the results
# (preallocated with numeric(3*nsamp): nsamp posterior draws per subject)
Subject = c(rep("Math", nsamp), rep("Reading", nsamp), rep("Science", nsamp))
Year.aver = data.frame(
Subject = Subject, Year15 = numeric(3*nsamp), Year18 = numeric(3*nsamp))
Region.aver = data.frame(
Subject = Subject, England = numeric(3*nsamp), NorthIre = numeric(3*nsamp),
Wales = numeric(3*nsamp), Scotland = numeric(3*nsamp))
YearReg.aver = data.frame(
Subject = Subject, England_15 = numeric(3*nsamp), England_18 = numeric(3*nsamp),
NorthIre_15 = numeric(3*nsamp), NorthIre_18 = numeric(3*nsamp),
Wales_15 = numeric(3*nsamp), Wales_18 = numeric(3*nsamp),
Scotland_15 = numeric(3*nsamp), Scotland_18 = numeric(3*nsamp))
# Calculate some parameters: the number of observations in each group, used as
# the n of rnorm() when drawing posterior predictive samples below
Year.n1 = sum(GB.df$Year == 2015)
Year.n2 = sum(GB.df$Year == 2018)
Region.n1 = sum(GB.df$Region == "England")
Region.n2 = sum(GB.df$Region == "NorthIre")
Region.n3 = sum(GB.df$Region == "Wales")
Region.n4 = sum(GB.df$Region == "Scotland")
YearReg.n1 = sum((GB.df$Year == 2015) & (GB.df$Region == "England"))
YearReg.n2 = sum((GB.df$Year == 2018) & (GB.df$Region == "England"))
YearReg.n3 = sum((GB.df$Year == 2015) & (GB.df$Region == "NorthIre"))
YearReg.n4 = sum((GB.df$Year == 2018) & (GB.df$Region == "NorthIre"))
YearReg.n5 = sum((GB.df$Year == 2015) & (GB.df$Region == "Wales"))
YearReg.n6 = sum((GB.df$Year == 2018) & (GB.df$Region == "Wales"))
YearReg.n7 = sum((GB.df$Year == 2015) & (GB.df$Region == "Scotland"))
YearReg.n8 = sum((GB.df$Year == 2018) & (GB.df$Region == "Scotland"))
# Compute the posterior predictive samples of the average PISA score
# of each subject for each group
# NOTE(review): attach() is discouraged — it puts GB.df's columns (Year,
# Region, ...) on the search path, where they can mask other objects;
# consider with(GB.df, ...) or explicit GB.df$ access instead
attach(GB.df)
for(i in 1:nsamp){
# Compute the posterior predictive samples of the average PISA score of the
# whole UK in the year 2015 and 2018 respectively for each subject
Year.aver[i, "Year15"] = mean(rnorm(n = Year.n1,
mean = Math.predictor[Year == 2015, i], sd = Math.sigma[, i]))
Year.aver[i, "Year18"] = mean(rnorm(n = Year.n2,
mean = Math.predictor[Year == 2018, i], sd = Math.sigma[, i]))
Year.aver[i + nsamp, "Year15"] = mean(rnorm(n = Year.n1,
mean = Read.predictor[Year == 2015, i], sd = Read.sigma[, i]))
Year.aver[i + nsamp, "Year18"] = mean(rnorm(n = Year.n2,
mean = Read.predictor[Year == 2018, i], sd = Read.sigma[, i]))
Year.aver[i + 2*nsamp, "Year15"] = mean(rnorm(n = Year.n1,
mean = Scie.predictor[Year == 2015, i], sd = Scie.sigma[, i]))
Year.aver[i + 2*nsamp, "Year18"] = mean(rnorm(n = Year.n2,
mean = Scie.predictor[Year == 2018, i], sd = Scie.sigma[, i]))
# Compute the posterior predictive samples of the average PISA score of each region in
# the UK respectively (combining the data of the year 2015 and 2018) for each subject
Region.aver[i, "England"] = mean(rnorm(n = Region.n1,
mean = Math.predictor[Region == "England", i], sd = Math.sigma[, i]))
Region.aver[i, "NorthIre"] = mean(rnorm(n = Region.n2,
mean = Math.predictor[Region == "NorthIre", i], sd = Math.sigma[, i]))
Region.aver[i, "Wales"] = mean(rnorm(n = Region.n3,
mean = Math.predictor[Region == "Wales", i], sd = Math.sigma[, i]))
Region.aver[i, "Scotland"] = mean(rnorm(n = Region.n4,
mean = Math.predictor[Region == "Scotland", i], sd = Math.sigma[, i]))
Region.aver[i + nsamp, "England"] = mean(rnorm(n = Region.n1,
mean = Read.predictor[Region == "England", i], sd = Read.sigma[, i]))
Region.aver[i + nsamp, "NorthIre"] = mean(rnorm(n = Region.n2,
mean = Read.predictor[Region == "NorthIre", i], sd = Read.sigma[, i]))
Region.aver[i + nsamp, "Wales"] = mean(rnorm(n = Region.n3,
mean = Read.predictor[Region == "Wales", i], sd = Read.sigma[, i]))
Region.aver[i + nsamp, "Scotland"] = mean(rnorm(n = Region.n4,
mean = Read.predictor[Region == "Scotland", i], sd = Read.sigma[, i]))
Region.aver[i + 2*nsamp, "England"] = mean(rnorm(n = Region.n1,
mean = Scie.predictor[Region == "England", i], sd = Scie.sigma[, i]))
Region.aver[i + 2*nsamp, "NorthIre"] = mean(rnorm(n = Region.n2,
mean = Scie.predictor[Region == "NorthIre", i], sd = Scie.sigma[, i]))
Region.aver[i + 2*nsamp, "Wales"] = mean(rnorm(n = Region.n3,
mean = Scie.predictor[Region == "Wales", i], sd = Scie.sigma[, i]))
Region.aver[i + 2*nsamp, "Scotland"] = mean(rnorm(n = Region.n4,
mean = Scie.predictor[Region == "Scotland", i], sd = Scie.sigma[, i]))
# Compute the posterior predictive samples of the average PISA score of each UK region
# in the year 2015 or 2018 respectively for each subject
# The computation for math subject
YearReg.aver[i, "England_15"] = mean(rnorm(n = YearReg.n1, mean =
Math.predictor[(Year == 2015) & (Region == "England"), i], sd = Math.sigma[, i]))
YearReg.aver[i, "England_18"] = mean(rnorm(n = YearReg.n2, mean =
Math.predictor[(Year == 2018) & (Region == "England"), i], sd = Math.sigma[, i]))
YearReg.aver[i, "NorthIre_15"] = mean(rnorm(n = YearReg.n3, mean =
Math.predictor[(Year == 2015) & (Region == "NorthIre"), i], sd = Math.sigma[, i]))
YearReg.aver[i, "NorthIre_18"] = mean(rnorm(n = YearReg.n4, mean =
Math.predictor[(Year == 2018) & (Region == "NorthIre"), i], sd = Math.sigma[, i]))
YearReg.aver[i, "Wales_15"] = mean(rnorm(n = YearReg.n5, mean =
Math.predictor[(Year == 2015) & (Region == "Wales"), i], sd = Math.sigma[, i]))
YearReg.aver[i, "Wales_18"] = mean(rnorm(n = YearReg.n6, mean =
Math.predictor[(Year == 2018) & (Region == "Wales"), i], sd = Math.sigma[, i]))
YearReg.aver[i, "Scotland_15"] = mean(rnorm(n = YearReg.n7, mean =
Math.predictor[(Year == 2015) & (Region == "Scotland"), i], sd = Math.sigma[, i]))
YearReg.aver[i, "Scotland_18"] = mean(rnorm(n = YearReg.n8, mean =
Math.predictor[(Year == 2018) & (Region == "Scotland"), i], sd = Math.sigma[, i]))
# The computation for reading subject
YearReg.aver[i + nsamp, "England_15"] = mean(rnorm(n = YearReg.n1, mean =
Read.predictor[(Year == 2015) & (Region == "England"), i], sd = Read.sigma[, i]))
YearReg.aver[i + nsamp, "England_18"] = mean(rnorm(n = YearReg.n2, mean =
Read.predictor[(Year == 2018) & (Region == "England"), i], sd = Read.sigma[, i]))
YearReg.aver[i + nsamp, "NorthIre_15"] = mean(rnorm(n = YearReg.n3, mean =
Read.predictor[(Year == 2015) & (Region == "NorthIre"), i], sd = Read.sigma[, i]))
YearReg.aver[i + nsamp, "NorthIre_18"] = mean(rnorm(n = YearReg.n4, mean =
Read.predictor[(Year == 2018) & (Region == "NorthIre"), i], sd = Read.sigma[, i]))
YearReg.aver[i + nsamp, "Wales_15"] = mean(rnorm(n = YearReg.n5, mean =
Read.predictor[(Year == 2015) & (Region == "Wales"), i], sd = Read.sigma[, i]))
YearReg.aver[i + nsamp, "Wales_18"] = mean(rnorm(n = YearReg.n6, mean =
Read.predictor[(Year == 2018) & (Region == "Wales"), i], sd = Read.sigma[, i]))
YearReg.aver[i + nsamp, "Scotland_15"] = mean(rnorm(n = YearReg.n7, mean =
Read.predictor[(Year == 2015) & (Region == "Scotland"), i], sd = Read.sigma[, i]))
YearReg.aver[i + nsamp, "Scotland_18"] = mean(rnorm(n = YearReg.n8, mean =
Read.predictor[(Year == 2018) & (Region == "Scotland"), i], sd = Read.sigma[, i]))
# The computation for science subject
YearReg.aver[i + 2*nsamp, "England_15"] = mean(rnorm(n = YearReg.n1, mean =
Scie.predictor[(Year == 2015) & (Region == "England"), i], sd = Scie.sigma[, i]))
YearReg.aver[i + 2*nsamp, "England_18"] = mean(rnorm(n = YearReg.n2, mean =
Scie.predictor[(Year == 2018) & (Region == "England"), i], sd = Scie.sigma[, i]))
YearReg.aver[i + 2*nsamp, "NorthIre_15"] = mean(rnorm(n = YearReg.n3, mean =
Scie.predictor[(Year == 2015) & (Region == "NorthIre"), i], sd = Scie.sigma[, i]))
YearReg.aver[i + 2*nsamp, "NorthIre_18"] = mean(rnorm(n = YearReg.n4, mean =
Scie.predictor[(Year == 2018) & (Region == "NorthIre"), i], sd = Scie.sigma[, i]))
YearReg.aver[i + 2*nsamp, "Wales_15"] = mean(rnorm(n = YearReg.n5, mean =
Scie.predictor[(Year == 2015) & (Region == "Wales"), i], sd = Scie.sigma[, i]))
YearReg.aver[i + 2*nsamp, "Wales_18"] = mean(rnorm(n = YearReg.n6, mean =
Scie.predictor[(Year == 2018) & (Region == "Wales"), i], sd = Scie.sigma[, i]))
YearReg.aver[i + 2*nsamp, "Scotland_15"] = mean(rnorm(n = YearReg.n7, mean =
Scie.predictor[(Year == 2015) & (Region == "Scotland"), i], sd = Scie.sigma[, i]))
YearReg.aver[i + 2*nsamp, "Scotland_18"] = mean(rnorm(n = YearReg.n8, mean =
Scie.predictor[(Year == 2018) & (Region == "Scotland"), i], sd = Scie.sigma[, i]))
}
gc(rm(Math.predictor)); gc(rm(Math.sigma))
gc(rm(Read.predictor)); gc(rm(Read.sigma))
gc(rm(Scie.predictor)); gc(rm(Scie.sigma))
detach(GB.df)
# Visualize the posterior predictive density of the average PISA score of
# each subject for each group
# Visualize the posterior predictive density of the average PISA score of the
# whole UK in the year 2015 and 2018 for each subject respectively
# (one density facet per subject, colored by survey year)
Year.aver %>% pivot_longer(cols = Year15:Year18,
                           names_to = "Year", values_to = "Score") %>%
  ggplot(aes(x = Score, color = Year)) +
  geom_density() + facet_wrap(~Subject, ncol = 3) +
  labs(x = "PISA Score", y = "Density",
       # paste0() is the idiomatic replacement for paste(..., sep = "")
       title = paste0("Posterior predictive distribution of the average\n",
                      "PISA score in the year 2015 or 2018")) +
  theme(axis.title = element_text(size = 12), axis.text = element_text(size = 12),
        strip.text = element_text(size = 12, face = "bold"),
        plot.title = element_text(hjust = 0.5, size = 14),
        legend.title = element_text(size = 12, face = "bold"),
        legend.text = element_text(size = 12), legend.position = "top")
ggsave("P4F6.png", width = 8, height = 5)
# Visualize the posterior predictive density of the average PISA score of each
# region in the UK (combining the data of the year 2015 and 2018) for each
# subject respectively (one density facet per subject, colored by region)
Region.aver %>% pivot_longer(cols = England:Scotland,
                             names_to = "Region", values_to = "Score") %>%
  ggplot(aes(x = Score, color = Region)) +
  geom_density() + facet_wrap(~Subject, ncol = 3) +
  labs(x = "PISA Score", y = "Density",
       # paste0() is the idiomatic replacement for paste(..., sep = "")
       title = paste0("Posterior predictive distribution of the average\n",
                      "PISA score in each region of the UK")) +
  theme(axis.title = element_text(size = 12), axis.text = element_text(size = 12),
        strip.text = element_text(size = 12, face = "bold"),
        plot.title = element_text(hjust = 0.5, size = 14),
        legend.title = element_text(size = 12, face = "bold"),
        legend.text = element_text(size = 12), legend.position = "top")
ggsave("P4F7.png", width = 8, height = 5)
# Visualize the posterior predictive density of the average PISA score of each
# UK region in the year 2015 or 2018 for each subject respectively.
# The column names "Region_YY" are split into a Region and a Year variable,
# and the Year facet strips are relabeled via Year.labs.
Year.labs = c("Year 2015", "Year 2018"); names(Year.labs) = c("15", "18")
YearReg.aver %>% pivot_longer(cols = England_15:Scotland_18,
                              names_to = c("Region", "Year"), names_sep = "_", values_to = "Score") %>%
  ggplot(aes(x = Score, color = Region)) + geom_density() +
  facet_grid(Year ~ Subject, labeller = labeller(Year = Year.labs)) +
  labs(x = "PISA Score", y = "Density",
       # paste0() is the idiomatic replacement for paste(..., sep = "")
       title = paste0("Posterior predictive distribution of the average PISA score\n",
                      "in each UK region and the year 2015 or 2018")) +
  theme(axis.title = element_text(size = 12), axis.text = element_text(size = 12),
        strip.text = element_text(size = 12, face = "bold"),
        plot.title = element_text(hjust = 0.5, size = 14),
        legend.title = element_text(size = 12, face = "bold"),
        legend.text = element_text(size = 12), legend.position = "top")
ggsave("P4F8.png", width = 8, height = 6)
In this section, I compare the competitive pressure felt by students in the UK and Italy, and examine its effect on PISA scores in these two countries.
Firstly, I load the data set Competition18.RData which is extracted from the original data file of PISA.
You need to upload this data set to the following path on Kaggle:
"../input/competition/Competition18.RData"
Then, I perform some operations on this data set and draw a boxplot to compare the overall competition pressure on students between the UK and Italy.
# Load the data set `Competition18.RData`, which records the information
# about the competition pressure on students in the UK and Italy
load("../input/competition/Competition18.RData")
summary(Competition18)
# Build the analysis data set: an overall competition score (sum of the four
# ST205 items, shifted to start at 0), a 0/1 UK indicator, and a readable
# country label; the raw items are then dropped and rows with a missing
# competition score are removed.
Compe18.data = Competition18 %>%
  mutate(Competition = ST205Q01HA + ST205Q02HA + ST205Q03HA + ST205Q04HA - 4) %>%
  mutate(GBR.Indi = as.integer(CNT == "GBR")) %>%
  mutate(CNT = ifelse(CNT == "GBR", "British", "Italy")) %>%
  select(!c(ST205Q01HA, ST205Q02HA, ST205Q03HA, ST205Q04HA)) %>%
  rename(Country = CNT) %>%
  filter(!is.na(Competition))
summary(Compe18.data)
# Draw a boxplot to compare the overall competition pressure on students
# between the UK and Italy
Compe18.data %>%
  ggplot(aes(x = Country, y = Competition)) +
  geom_boxplot() +
  labs(title = "The pressure of competition on\nstudents in the UK and Italy",
       y = "Level of competition pressure") +
  scale_y_continuous(breaks = seq(0, 12, 2)) +
  theme(axis.title = element_text(size = 10), axis.text = element_text(size = 10),
        plot.title = element_text(hjust = 0.5, size = 12))
ggsave("P5F1.png", width = 4, height = 4)
1. Conduct frequentist analyses to explore the relationship between PISA scores and competition pressure levels in Britain and Italy
(1) Plot the scores of the three subjects against competition pressure levels respectively
(2) Construct generalized linear models to predict PISA scores based on country and competition pressure level
(3) Visualize the results of the GLMs for the three subjects
# Plot the scores of the three subjects against competition pressure levels
# Plot the math scores against competition pressure levels
# `math` holds the name of the maths score column as a string (it is also
# used as a column index elsewhere in this script); the tidy-eval `.data`
# pronoun is the robust way to reference a column by a string inside aes(),
# instead of get(), whose lookup through the data mask is fragile.
Plot.math = Compe18.data %>%
  ggplot(aes(x = Competition, y = .data[[math]])) +
  geom_point(aes(color = Country)) +
  xlab("Level of competition pressure") + ylab("Math score") +
  labs(title = "Trend of math score with competition\npressure in the UK and Italy") +
  scale_x_continuous(breaks = 0:12) +
  theme(axis.title = element_text(size = 12), axis.text = element_text(size = 12),
        plot.title = element_text(hjust = 0.5, size = 13),
        legend.title = element_text(size = 12, face = "bold"),
        legend.text = element_text(size = 12), legend.position = "top")
Plot.math
ggsave("P5F2(1).png", width = 4, height = 5)
# Plot the reading scores against competition pressure levels
# `reading` holds the name of the reading score column as a string; use the
# tidy-eval `.data` pronoun rather than get() (see the math plot).
Plot.reading = Compe18.data %>%
  ggplot(aes(x = Competition, y = .data[[reading]])) +
  geom_point(aes(color = Country)) +
  xlab("Level of competition pressure") + ylab("Reading score") +
  labs(title = "Trend of reading score with competition\npressure in the UK and Italy") +
  scale_x_continuous(breaks = 0:12) +
  theme(axis.title = element_text(size = 12), axis.text = element_text(size = 12),
        plot.title = element_text(hjust = 0.5, size = 13),
        legend.title = element_text(size = 12, face = "bold"),
        legend.text = element_text(size = 12), legend.position = "top")
Plot.reading
ggsave("P5F2(2).png", width = 4, height = 5)
# Plot the science scores against competition pressure levels
# `science` holds the name of the science score column as a string; use the
# tidy-eval `.data` pronoun rather than get(), whose data-mask lookup is
# fragile when a same-named object exists in the calling environment.
Plot.science = Compe18.data %>%
  ggplot(aes(x = Competition, y = .data[[science]])) +
  geom_point(aes(color = Country)) +
  xlab("Level of competition pressure") + ylab("Science score") +
  labs(title = "Trend of science score with competition\npressure in the UK and Italy") +
  scale_x_continuous(breaks = 0:12) +
  theme(axis.title = element_text(size = 12), axis.text = element_text(size = 12),
        plot.title = element_text(hjust = 0.5, size = 13),
        legend.title = element_text(size = 12, face = "bold"),
        legend.text = element_text(size = 12), legend.position = "top")
Plot.science
ggsave("P5F2(3).png", width = 4, height = 5)
# Center the competition pressure variable (so the intercept corresponds to
# an average pressure level) and turn `Country` into a factor with Italy as
# the baseline class.
compe.mean = mean(Compe18.data$Competition)
Compe18.center = Compe18.data %>%
  mutate(Competition = Competition - compe.mean,
         Country = factor(Country, levels = c("Italy", "British")))
# Create the grid of data points at which the PISA scores will be predicted
# and confidence intervals built (one centered grid per country).
breaks = seq(from = 0, to = 12, by = 0.1)
n = length(breaks)
data.predict = data.frame(
  Country = rep(c("British", "Italy"), each = n),
  Competition = rep(breaks, 2) - compe.mean)
# Construct generalized linear models to predict PISA scores
# based on country and competition pressure level
# The model to predict math scores based on country and competition pressure level
# Linear model with a country main effect, a (centered) competition main
# effect, and their interaction, so each country gets its own slope.
# `math` is assumed to hold the name of the maths score column -- get()
# resolves it inside the data environment; TODO confirm it is a string.
Compe18.fit.math = lm(get(math) ~ Country + Competition + Country:Competition,
data = Compe18.center)
summary(Compe18.fit.math)
confint(Compe18.fit.math)
# Arrange the four diagnostic plots (studentized residuals vs index plus
# the first three lm() diagnostics) in a 2x2 panel. NOTE: this mfrow layout
# stays active for the reading and science diagnostics further below.
par(mfrow = c(2, 2))
plot(rstudent(Compe18.fit.math), ylab = "Standardized Residuals",
cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2,
main = "Standardized residuals vs Index", col = "blue")
plot(Compe18.fit.math, col = "blue", sub = "", cex.main = 1.2,
cex.lab = 1.15, cex.axis = 1.2, lwd = 2, which = 1:3)
# Recover the per-country intercepts and slopes: Italy is the baseline,
# the British line adds the Country main effect and the interaction term.
coeffi = Compe18.fit.math$coefficients
ITA.inter = coeffi["(Intercept)"]; ITA.slope = coeffi["Competition"]
GBR.inter = ITA.inter + coeffi["CountryBritish"]
GBR.slope = ITA.slope + coeffi["CountryBritish:Competition"]
# Pointwise 95% confidence bands for the fitted mean over the prediction grid.
# The fitted column is renamed to the score column's name (presumably
# "PV2MATH" -- verify against `math`) so the ribbon layer can inherit
# Plot.math's y aesthetic; Competition is shifted back to its original scale.
prediction = data.frame(
predict(Compe18.fit.math, newdata = data.predict, interval = "confidence"))
prediction = cbind(data.predict, prediction) %>% rename(PV2MATH = fit) %>%
mutate(Competition = Competition + compe.mean)
# Visualize the results of the GLM for math subject: the two fitted lines
# (intercepts shifted to the un-centered Competition scale) plus the bands.
Plot.math +
geom_abline(intercept = GBR.inter - GBR.slope*compe.mean,
slope = GBR.slope, color = "red") +
geom_abline(intercept = ITA.inter - ITA.slope*compe.mean,
slope = ITA.slope, color = "blue") +
geom_ribbon(aes(ymin = lwr, ymax = upr, group = Country),
data = prediction, fill = "gray", alpha = 0.6)
ggsave("P5F3(1).png", width = 4, height = 5)
# The model to predict reading scores based on country and competition pressure level
# Same interaction model as for math; `reading` is assumed to hold the name
# of the reading score column. The 2x2 diagnostic panel layout set for the
# math model is still in effect here.
Compe18.fit.reading = lm(get(reading) ~ Country + Competition + Country:Competition,
data = Compe18.center)
summary(Compe18.fit.reading)
confint(Compe18.fit.reading)
plot(rstudent(Compe18.fit.reading), ylab = "Standardized Residuals",
cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2,
main = "Standardized residuals vs Index", col = "blue")
plot(Compe18.fit.reading, col = "blue", sub = "", cex.main = 1.2,
cex.lab = 1.15, cex.axis = 1.2, lwd = 2, which = 1:3)
# Per-country intercepts/slopes: Italy is the baseline level of Country.
coeffi = Compe18.fit.reading$coefficients
ITA.inter = coeffi["(Intercept)"]; ITA.slope = coeffi["Competition"]
GBR.inter = ITA.inter + coeffi["CountryBritish"]
GBR.slope = ITA.slope + coeffi["CountryBritish:Competition"]
# Pointwise 95% confidence bands; the fitted column takes the reading score
# column's name (presumably "PV2READ" -- verify against `reading`).
prediction = data.frame(
predict(Compe18.fit.reading, newdata = data.predict, interval = "confidence"))
prediction = cbind(data.predict, prediction) %>% rename(PV2READ = fit) %>%
mutate(Competition = Competition + compe.mean)
# Visualize the results of the GLM for reading subject (lines shifted back
# to the un-centered Competition scale).
Plot.reading +
geom_abline(intercept = GBR.inter - GBR.slope*compe.mean,
slope = GBR.slope, color = "red") +
geom_abline(intercept = ITA.inter - ITA.slope*compe.mean,
slope = ITA.slope, color = "blue") +
geom_ribbon(aes(ymin = lwr, ymax = upr, group = Country),
data = prediction, fill = "gray", alpha = 0.6)
ggsave("P5F3(2).png", width = 4, height = 5)
# The model to predict science scores based on country and competition pressure level
# Same interaction model as for math and reading; `science` is assumed to
# hold the name of the science score column. The 2x2 diagnostic panel layout
# set for the math model is still in effect here.
Compe18.fit.science = lm(get(science) ~ Country + Competition + Country:Competition,
data = Compe18.center)
summary(Compe18.fit.science)
confint(Compe18.fit.science)
plot(rstudent(Compe18.fit.science), ylab = "Standardized Residuals",
cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2,
main = "Standardized residuals vs Index", col = "blue")
plot(Compe18.fit.science, col = "blue", sub = "", cex.main = 1.2,
cex.lab = 1.15, cex.axis = 1.2, lwd = 2, which = 1:3)
# Per-country intercepts/slopes: Italy is the baseline level of Country.
coeffi = Compe18.fit.science$coefficients
ITA.inter = coeffi["(Intercept)"]; ITA.slope = coeffi["Competition"]
GBR.inter = ITA.inter + coeffi["CountryBritish"]
GBR.slope = ITA.slope + coeffi["CountryBritish:Competition"]
# Pointwise 95% confidence bands; the fitted column takes the science score
# column's name (presumably "PV2SCIE" -- verify against `science`).
prediction = data.frame(
predict(Compe18.fit.science, newdata = data.predict, interval = "confidence"))
prediction = cbind(data.predict, prediction) %>% rename(PV2SCIE = fit) %>%
mutate(Competition = Competition + compe.mean)
# Visualize the results of the GLM for science subject (lines shifted back
# to the un-centered Competition scale).
Plot.science +
geom_abline(intercept = GBR.inter - GBR.slope*compe.mean,
slope = GBR.slope, color = "red") +
geom_abline(intercept = ITA.inter - ITA.slope*compe.mean,
slope = ITA.slope, color = "blue") +
geom_ribbon(aes(ymin = lwr, ymax = upr, group = Country),
data = prediction, fill = "gray", alpha = 0.6)
ggsave("P5F3(3).png", width = 4, height = 5)
# Close all open graphics devices, resetting the par(mfrow) layout.
graphics.off()
2. Conduct Bayesian analyses to explore the relationship between PISA scores and competition pressure levels in Britain and Italy
(1) Specify the structure of the Bayesian (JAGS) model for the three subjects
(2) Construct the Bayesian (JAGS) model for each subject respectively and sample from the posterior distributions of the model parameters
(3) Visualize the relationship between PISA scores and competition pressure on students in Britain and Italy
# The string of model statement
# JAGS model: a Gaussian linear regression of a score on the UK indicator,
# the (centered) competition level and their interaction -- the Bayesian
# counterpart of the frequentist lm() fits above. The coefficients get vague
# normal priors (precision tau.beta = 0.01); sigma gets a log-normal prior
# whose ln.mu is chosen so the prior mean of sigma is 100 (roughly the PISA
# score scale's spread). Note: the "#"-comments inside the quoted string
# below are JAGS-side comments and are part of the model text passed to
# jags.model(), so they must not be edited as R comments.
Compe18.Bayes.model = "model{
# The likelihood
for (i in 1:n){
Score[i] ~ dnorm(mu[i], tau)
mu[i] <- Intercept + British*GBR.Indi[i] + Compe*Competition[i] +
GBR.Compe*(GBR.Indi[i]*Competition[i])
}
# The priors
Intercept ~ dnorm(0, tau.beta)
British ~ dnorm(0, tau.beta)
Compe ~ dnorm(0, tau.beta)
GBR.Compe ~ dnorm(0, tau.beta)
sigma ~ dlnorm(ln.mu, ln.prec)
tau <- 1/pow(sigma, 2)
# The hyperparameters
tau.beta <- 0.01
ln.prec <- 0.1
ln.mu <- log(100) - 1/(2*ln.prec)
}
"
# Specify initial values for the parameters and prepare the data for the model
# Draw randomized starting values for the four regression coefficients and
# for sigma (log-normal, centered near 100), so each MCMC chain starts from
# a different point.
Compe18.Bayes.inits = function(){
  list(Intercept = rnorm(1, 0, 1), British = rnorm(1, 0, 1),
       Compe = rnorm(1, 0, 1), GBR.Compe = rnorm(1, 0, 1),
       sigma = rlnorm(1, log(100), 1))
}
# Data list passed to JAGS: sample size, a placeholder Score vector (filled
# in per subject below), and the two predictors.
Compe18.Bayes.data = with(Compe18.center, list(
  n = length(GBR.Indi), Score = numeric(length(GBR.Indi)),
  GBR.Indi = GBR.Indi, Competition = Competition))
# Construct Bayesian (JAGS) models for the three subjects respectively and
# visualize the results of predicting the scores of these subjects using the
# Bayesian models separately
# Compile the JAGS model for math subject and sample from the
# posterior distributions of the parameters
# The shared data list gets the maths score column as the response;
# `math` is assumed to hold that column's name -- TODO confirm.
Compe18.math.data = Compe18.Bayes.data
Compe18.math.data$Score = Compe18.data[, math]
# Two chains with randomized inits; 5000 burn-in iterations, then 15000
# iterations thinned by 5 (3000 retained draws per chain) to reduce
# autocorrelation in the saved samples.
Compe18.math.model = jags.model(
file = textConnection(Compe18.Bayes.model), data = Compe18.math.data,
inits = Compe18.Bayes.inits, n.chains = 2)
update(Compe18.math.model, n.iter = 5000)
Compe18.math.sample = coda.samples(
Compe18.math.model, n.iter = 15000, thin = 5,
variable.names = c("Intercept", "British", "Compe", "GBR.Compe", "sigma"))
# Perform model diagnostics and display the summary statistics
# (trace/density plots, Gelman-Rubin shrink factors, autocorrelation,
# effective sample sizes).
plot(Compe18.math.sample)
gelman.plot(Compe18.math.sample)
autocorr.plot(Compe18.math.sample)
effectiveSize(Compe18.math.sample)
summary(Compe18.math.sample)
# Visualize the results of the Bayesian model for math subject
# Stack the draws from both chains into one data frame.
# NOTE(review): this reassigns `nsamp`, overwriting the value used by the
# posterior predictive loop earlier in the script.
Math.sample.df = do.call(rbind.data.frame, Compe18.math.sample)
nsamp = nrow(Math.sample.df)
# For every grid point, form the posterior distribution of the regression
# mean and store its 2.5%/97.5% quantiles (a pointwise 95% credible band)
# and posterior mean. The mean column is named "PV2MATH" (presumably equal
# to `math` -- verify) so the ribbon can inherit Plot.math's y aesthetic.
prediction = data.predict %>%
mutate(GBR.Indi = as.integer(Country == "British"),
lower = numeric(2*n), upper = numeric(2*n), PV2MATH = numeric(2*n))
for (i in 1:(2*n)){
sample = as.numeric(
Math.sample.df$Intercept + (Math.sample.df$British)*(prediction$GBR.Indi[i]) +
(Math.sample.df$Compe)*(prediction$Competition[i]) +
(Math.sample.df$GBR.Compe)*(prediction$GBR.Indi[i])*(prediction$Competition[i]))
prediction[i, "lower"] = quantile(sample, 0.025)
prediction[i, "upper"] = quantile(sample, 0.975)
prediction[i, "PV2MATH"] = mean(sample)
}
# Shift Competition back to its original (un-centered) scale for plotting.
prediction = prediction %>% select(!GBR.Indi) %>%
mutate(Competition = Competition + compe.mean)
# Posterior-mean regression lines per country (Italy is the baseline; the
# British line adds the `British` and `GBR.Compe` effects), shifted back to
# the un-centered Competition scale, overlaid with the credible band.
coeffi = summary(Compe18.math.sample)[[1]][, "Mean"]
ITA.inter = coeffi["Intercept"]; ITA.slope = coeffi["Compe"]
GBR.inter = ITA.inter + coeffi["British"]
GBR.slope = ITA.slope + coeffi["GBR.Compe"]
Plot.math +
geom_abline(intercept = GBR.inter - GBR.slope*compe.mean,
slope = GBR.slope, color = "red") +
geom_abline(intercept = ITA.inter - ITA.slope*compe.mean,
slope = ITA.slope, color = "blue") +
geom_ribbon(aes(ymin = lower, ymax = upper, group = Country),
data = prediction, fill = "gray", alpha = 0.6)
ggsave("P5F4(1).png", width = 4, height = 5)
# Compile the JAGS model for reading subject and sample from the
# posterior distributions of the parameters
# Same setup as for math: two chains, 5000 burn-in iterations, 15000
# sampling iterations thinned by 5. `reading` is assumed to hold the
# reading score column's name -- TODO confirm.
Compe18.reading.data = Compe18.Bayes.data
Compe18.reading.data$Score = Compe18.data[, reading]
Compe18.reading.model = jags.model(
file = textConnection(Compe18.Bayes.model), data = Compe18.reading.data,
inits = Compe18.Bayes.inits, n.chains = 2)
update(Compe18.reading.model, n.iter = 5000)
Compe18.reading.sample = coda.samples(
Compe18.reading.model, n.iter = 15000, thin = 5,
variable.names = c("Intercept", "British", "Compe", "GBR.Compe", "sigma"))
# Perform model diagnostics and display the summary statistics
plot(Compe18.reading.sample)
gelman.plot(Compe18.reading.sample)
autocorr.plot(Compe18.reading.sample)
effectiveSize(Compe18.reading.sample)
summary(Compe18.reading.sample)
# Visualize the results of the Bayesian model for reading subject
# Stack both chains; NOTE(review): `nsamp` is reassigned again here.
Reading.sample.df = do.call(rbind.data.frame, Compe18.reading.sample)
nsamp = nrow(Reading.sample.df)
# Pointwise 95% credible band and posterior mean of the regression mean on
# the prediction grid; the mean column is named "PV2READ" (presumably equal
# to `reading` -- verify) so the ribbon inherits Plot.reading's y aesthetic.
prediction = data.predict %>%
mutate(GBR.Indi = as.integer(Country == "British"),
lower = numeric(2*n), upper = numeric(2*n), PV2READ = numeric(2*n))
for (i in 1:(2*n)){
sample = as.numeric(
Reading.sample.df$Intercept +
(Reading.sample.df$British)*(prediction$GBR.Indi[i]) +
(Reading.sample.df$Compe)*(prediction$Competition[i]) +
(Reading.sample.df$GBR.Compe)*(prediction$GBR.Indi[i])*
(prediction$Competition[i]))
prediction[i, "lower"] = quantile(sample, 0.025)
prediction[i, "upper"] = quantile(sample, 0.975)
prediction[i, "PV2READ"] = mean(sample)
}
# Shift Competition back to its original (un-centered) scale for plotting.
prediction = prediction %>% select(!GBR.Indi) %>%
mutate(Competition = Competition + compe.mean)
# Posterior-mean lines per country (Italy baseline), overlaid with the band.
coeffi = summary(Compe18.reading.sample)[[1]][, "Mean"]
ITA.inter = coeffi["Intercept"]; ITA.slope = coeffi["Compe"]
GBR.inter = ITA.inter + coeffi["British"]
GBR.slope = ITA.slope + coeffi["GBR.Compe"]
Plot.reading +
geom_abline(intercept = GBR.inter - GBR.slope*compe.mean,
slope = GBR.slope, color = "red") +
geom_abline(intercept = ITA.inter - ITA.slope*compe.mean,
slope = ITA.slope, color = "blue") +
geom_ribbon(aes(ymin = lower, ymax = upper, group = Country),
data = prediction, fill = "gray", alpha = 0.6)
ggsave("P5F4(2).png", width = 4, height = 5)
# Compile the JAGS model for science subject and sample from the
# posterior distributions of the parameters
# Same setup as for math and reading: two chains, 5000 burn-in iterations,
# 15000 sampling iterations thinned by 5. `science` is assumed to hold the
# science score column's name -- TODO confirm.
Compe18.science.data = Compe18.Bayes.data
Compe18.science.data$Score = Compe18.data[, science]
Compe18.science.model = jags.model(
file = textConnection(Compe18.Bayes.model), data = Compe18.science.data,
inits = Compe18.Bayes.inits, n.chains = 2)
update(Compe18.science.model, n.iter = 5000)
Compe18.science.sample = coda.samples(
Compe18.science.model, n.iter = 15000, thin = 5,
variable.names = c("Intercept", "British", "Compe", "GBR.Compe", "sigma"))
# Perform model diagnostics and display the summary statistics
plot(Compe18.science.sample)
gelman.plot(Compe18.science.sample)
autocorr.plot(Compe18.science.sample)
effectiveSize(Compe18.science.sample)
summary(Compe18.science.sample)
# Visualize the results of the Bayesian model for science subject
# Stack both chains; NOTE(review): `nsamp` is reassigned again here.
Science.sample.df = do.call(rbind.data.frame, Compe18.science.sample)
nsamp = nrow(Science.sample.df)
# Pointwise 95% credible band and posterior mean of the regression mean on
# the prediction grid; the mean column is named "PV2SCIE" (presumably equal
# to `science` -- verify) so the ribbon inherits Plot.science's y aesthetic.
prediction = data.predict %>%
mutate(GBR.Indi = as.integer(Country == "British"),
lower = numeric(2*n), upper = numeric(2*n), PV2SCIE = numeric(2*n))
for (i in 1:(2*n)){
sample = as.numeric(
Science.sample.df$Intercept +
(Science.sample.df$British)*(prediction$GBR.Indi[i]) +
(Science.sample.df$Compe)*(prediction$Competition[i]) +
(Science.sample.df$GBR.Compe)*(prediction$GBR.Indi[i])*
(prediction$Competition[i]))
prediction[i, "lower"] = quantile(sample, 0.025)
prediction[i, "upper"] = quantile(sample, 0.975)
prediction[i, "PV2SCIE"] = mean(sample)
}
# Shift Competition back to its original (un-centered) scale for plotting.
prediction = prediction %>% select(!GBR.Indi) %>%
mutate(Competition = Competition + compe.mean)
# Posterior-mean lines per country (Italy baseline), overlaid with the band.
coeffi = summary(Compe18.science.sample)[[1]][, "Mean"]
ITA.inter = coeffi["Intercept"]; ITA.slope = coeffi["Compe"]
GBR.inter = ITA.inter + coeffi["British"]
GBR.slope = ITA.slope + coeffi["GBR.Compe"]
Plot.science +
geom_abline(intercept = GBR.inter - GBR.slope*compe.mean,
slope = GBR.slope, color = "red") +
geom_abline(intercept = ITA.inter - ITA.slope*compe.mean,
slope = ITA.slope, color = "blue") +
geom_ribbon(aes(ymin = lower, ymax = upper, group = Country),
data = prediction, fill = "gray", alpha = 0.6)
ggsave("P5F4(3).png", width = 4, height = 5)